Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
avoca-dorable authored Dec 13, 2016
1 parent e36ceea commit 60a1bea
Showing 1 changed file with 110 additions and 6 deletions.
116 changes: 110 additions & 6 deletions RegressionTree.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,53 @@ def get_splitting_points(args):
possible_split.append(np.mean((attribute[i],attribute[i+1])))
return possible_split, col

# create a dictionary, key is the attribute number, value is whole list of possible splits for that column
def find_best_split_parallel(args):
    """Find the lowest-cost split for one attribute (worker for ``Pool.map``).

    Args:
        args: a ``(split_point, data, label)`` tuple, packed into a single
            argument because ``Pool.map`` hands each worker one item.
            ``split_point`` is a ``(key, possible_split)`` pair: the attribute
            (column) number and the candidate split values for that column.

    Returns:
        ``(best_ls, best_split, best_children)``. ``best_split`` is
        ``(key, split)`` and ``best_children`` is the tuple returned by
        ``split_children`` for that split. If ``possible_split`` is empty,
        ``best_ls`` is infinity and the other two entries are ``None``.
    """
    split_point, data, label = args
    key, possible_split = split_point

    # Infinity instead of a magic 1000000, so a split is never rejected just
    # because every candidate happens to have a large cost.
    best_ls = float('inf')
    best_split = None
    best_children = None
    # float() keeps the weighting fractional under Python 2 integer division.
    total = float(len(label))

    for split in possible_split:
        children = split_children(data, label, key, split)
        # split_children returns (left_data, left_label, right_data, right_label).
        left_label, right_label = children[1], children[3]

        # Weighted average of the left and right costs.
        ls = (len(left_label) * least_square(left_label) / total
              + len(right_label) * least_square(right_label) / total)
        if ls < best_ls:
            best_ls = ls
            best_split = (key, split)
            best_children = children
    return best_ls, best_split, best_children

def find_best_split(data, label, split_points):
    """Pick the best split across all attributes, one worker task per attribute.

    Args:
        data: feature table forwarded unchanged to each worker.
        label: labels forwarded unchanged to each worker.
        split_points: dictionary mapping an attribute number to the list of
            possible splitting values for that attribute.

    Returns:
        ``(best_split, best_children)`` where ``best_split`` is an
        ``(attribute, value)`` tuple and ``best_children`` is the matching
        result of ``split_children``. Both are ``None`` when no worker
        produced a candidate.
    """
    # Infinity instead of a magic 1000000 so large costs are still comparable.
    best_ls = float('inf')
    best_split = None
    best_children = None
    print(len(split_points))

    # zip() stops at the end of split_points.items(); repeat() just pairs the
    # same data/label with every attribute.
    jobs = zip(split_points.items(), repeat(data), repeat(label))
    pool = Pool()
    try:
        for ls, split, children in pool.map(find_best_split_parallel, jobs):
            if ls < best_ls:
                best_ls = ls
                best_split = split
                best_children = children
    finally:
        # Release the worker processes even if a worker raised.
        pool.close()
        pool.join()

    return best_split, best_children

def split_children(data, label, key, split):
left_index = [index for index in xrange(len(data.iloc[:,key])) if data.iloc[index,key] < split]
Expand All @@ -34,7 +81,13 @@ def split_children(data, label, key, split):
return left_data, left_label, right_data, right_label

def least_square(label):
    """Return the cost used to score a group of labels.

    Args:
        label: sized collection of hashable numeric labels.

    Returns:
        ``0`` for an empty ``label``; otherwise the square of the label sum
        divided by the number of distinct label values, as a float.
    """
    if not len(label):
        return 0
    # NOTE(review): despite the name, this is not the sum of squared
    # deviations (np.sum((label - np.mean(label)) ** 2), equivalently
    # len(label) * np.var(label)) -- confirm the formula is intended.
    # float() prevents truncating integer division under Python 2 when the
    # labels are integers.
    return float(np.sum(label)) ** 2 / len(set(label))
Expand All @@ -49,12 +102,19 @@ def create_leaf(label):
'is_leaf':True,
'index':node_id}
leaf['value'] = round(np.mean(label),3)
# print 'val: ' + str(leaf['value'])
# print label
return leaf

def find_splits_parallel(args):
    """Minimise ``error_function`` over one column (worker for ``Pool.map``).

    Args:
        args: a ``(var_space, label, col)`` tuple: the values of one column,
            the labels, and the column number.

    Returns:
        The ``full_output`` result of ``scipy.optimize.fminbound``, searched
        between the smallest and largest value of the column. The caller
        unpacks it as ``(split, error, ierr, numf)``.
    """
    column_values, label, col = args
    lower_bound = min(column_values)
    upper_bound = max(column_values)
    # Extra positional arguments passed to error_function after the split point.
    extra_args = (col, column_values, label)
    return scipy.optimize.fminbound(
        error_function,
        lower_bound,
        upper_bound,
        args=extra_args,
        full_output=1,
    )

# return,
# if not min_error or error < min_error:
# min_error = error
# split_var = col
# min_split = split

def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth = 0):
remaining_features = all_pos_split
Expand All @@ -71,12 +131,14 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
return create_leaf(label)


#######
min_error = None
split_var = None
min_split = None

var_spaces = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])]
cols = [col for col in xrange(data.shape[1])]
print 'find best split'
pool = Pool()
for split, error, ierr, numf in pool.map(find_splits_parallel, zip(var_spaces, repeat(label), cols)):
if not min_error or error < min_error:
Expand All @@ -85,11 +147,33 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
min_split = split
pool.close()

# for col in xrange(data.shape[1]):
# var_space = data.iloc[:,col].tolist()
# split, error, ierr, numf = scipy.optimize.fminbound(error_function, min(var_space), max(var_space), args = (col, data.values.tolist(), label), full_output = 1)
# if not min_error or error < min_error:
# min_error = error
# split_var = col
# min_split = split
splitting_feature = (split_var, min_split)
#######
children = split_children(data, label, split_var, min_split)

# print "children:", children
print 'found best split'
# exit()

# # find splitting features
# print 'find best_split'
# splitting_feature, children = find_best_split(data, label, remaining_features) #tuple(id, value)
# print 'found best split'
# # print "split feature: ", splitting_feature
# # remove current features
# remaining_features[splitting_feature[0]].remove(splitting_feature[1])
left_data, left_label, right_data, right_label = children
if len(left_label) == 0 or len(right_label) == 0:
print current_depth
print "One side is empty"
return create_leaf(label)
# print 'left label: ' + str(left_label)

left_least_square = least_square(left_label)

Expand All @@ -116,6 +200,12 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
'index' : None}

def error_function(split_point, split_var, data, label):
# left_index = [index for index in xrange(len(data.iloc[:,split_var])) if data.iloc[index,split_var] < split_point]
# right_index = [index for index in xrange(len(data.iloc[:,split_var])) if data.iloc[index,split_var] >= split_point]
# left_data = data.iloc[left_index,:]
# right_data = data.iloc[right_index,:]
# left_label = [label[i] for i in left_index]
# right_label =[label[i] for i in right_index]
data1 = []
data2 = []
for i in xrange(len(data)):
Expand All @@ -131,7 +221,7 @@ def make_prediction(tree, x, annotate = False):
if tree['is_leaf']:
if annotate:
print "At leaf, predicting %s" % tree['value']
return tree['index']#, tree['value']
return tree['value']#tree['index']#, tree['value']
else:
# the splitting value of x.
split_feature_value = x[tree['splitting_feature'][0]]
Expand Down Expand Up @@ -165,7 +255,7 @@ def fit(self):


def predict(self, test):
    """Predict a value for every row of ``test`` using the fitted tree.

    Args:
        test: iterable of rows; each row is passed to ``make_prediction``
            together with ``self.tree``.

    Returns:
        A NumPy array holding one prediction per row, in input order.
    """
    # Build the predictions once; the earlier plain-list assignment was
    # immediately overwritten and only doubled the work.
    return np.array([make_prediction(self.tree, x) for x in test])

if __name__ == '__main__':
Expand All @@ -178,4 +268,18 @@ def predict(self, test):
model = RegressionTree(data, label)
model.fit()
print model.predict(test)


# all_pos_split = {}
# pool = Pool()
# splitting_data = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])]
# cols = [col for col in xrange(data.shape[1])]
# for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)):
# all_pos_split[col] = dat
# pool.close()

# # non parallel code
# # for col in range(data.shape[1]):
# # all_pos_split[col] = get_splitting_points(data.iloc[:,col].tolist())

# tree = create_tree(data, all_pos_split, label, current_depth = 0)
# print output(tree, test)

0 comments on commit 60a1bea

Please sign in to comment.