Commit
cleaned up code and fixed issue on windows computers
aagrawl3 committed Dec 14, 2016
1 parent 775c203 commit 024ce64
Showing 2 changed files with 14 additions and 86 deletions.
80 changes: 4 additions & 76 deletions RegressionTree.py
@@ -47,24 +47,14 @@ def find_best_split(data, label, split_points):
best_ls = 1000000
best_split = None
best_children = None
- print len(split_points.items())
pool = Pool()
for ls, split, children in pool.map(find_best_split_parallel, zip(split_points.items(), repeat(data), repeat(label))):
if ls < best_ls:
best_ls = ls
best_split = split
best_children = children
pool.close()
- # non parallel code
- # for key,possible_split in split_points.items():
- # for split in possible_split:
- # children = split_children(data, label, key, split)

- # #weighted average of left and right ls
- # ls = float(len(children[1]))/len(label)*least_square(children[1]) + float(len(children[3]))/len(label)*least_square(children[3])
- # if ls < best_ls:
- # best_ls = ls
- # best_split = (key, split)




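[Note] The parallel search above fans per-feature work out with multiprocessing.Pool, passing the shared arguments to every task via zip(..., repeat(...)). A minimal sketch of that pattern (names are illustrative, not from the repository); the __main__ guard is what Windows needs, since it spawns rather than forks worker processes:

    from itertools import repeat
    from multiprocessing import Pool

    def worker(args):
        # each task gets one (item, shared) tuple, mirroring the
        # zip(split_points.items(), repeat(data), repeat(label)) call above
        item, shared = args
        return item * shared

    if __name__ == '__main__':  # required on Windows (spawn, not fork)
        pool = Pool()
        try:
            results = pool.map(worker, zip([1, 2, 3], repeat(10)))
        finally:
            pool.close()
            pool.join()
        print(results)  # [10, 20, 30]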
@@ -81,13 +71,6 @@ def split_children(data, label, key, split):
return left_data, left_label, right_data, right_label

def least_square(label):
- # # given the
- # # return the
- # if not len(label):
- # return 0
- # # label = np.array(label).astype(float)
- # return len(label) * np.var(label)
- # # return np.sum((label - np.mean(label))**2)
if not len(label):
return 0
return (np.sum(label)**2)/len(set(label))
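[Note] As the removed comments record, the earlier implementation len(label) * np.var(label) equals the sum of squared deviations from the mean, np.sum((label - np.mean(label))**2). A quick check of that identity (np.var defaults to the population variance, ddof=0):

    import numpy as np

    label = np.array([1.0, 2.0, 4.0])
    ssd_direct = np.sum((label - np.mean(label)) ** 2)
    ssd_via_var = len(label) * np.var(label)
    assert np.isclose(ssd_direct, ssd_via_var)

Note that the expression kept in the code, (np.sum(label)**2)/len(set(label)), is a different quantity from either form above, so the swap changes behavior rather than just tidying.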
@@ -102,8 +85,6 @@ def create_leaf(label):
'is_leaf':True,
'index':node_id}
leaf['value'] = round(np.mean(label),3)
- # print 'val: ' + str(leaf['value'])
- # print label
return leaf

def find_splits_parallel(args):
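[Note] For reference, the node dictionaries that create_leaf (and, further down, create_tree) build have this shape; the concrete values are made up for illustration:

    leaf = {'splitting_feature': None, 'left': None, 'right': None,
            'is_leaf': True,
            'index': 7,        # node_id assigned at creation
            'value': 2.415}    # round(np.mean(label), 3)

    node = {'splitting_feature': (3, 0.52),  # (column index, split value)
            'left': leaf, 'right': leaf,
            'is_leaf': False, 'value': None, 'index': None}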
@@ -120,14 +101,10 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
remaining_features = all_pos_split
#stopping conditions
if sum([len(v)!= 0 for v in remaining_features.values()]) == 0:
- print current_depth
- print "stop because no more features."
# If there are no remaining features to consider, make current node a leaf node
return create_leaf(label)
# #Additional stopping condition (limit tree depth)
elif current_depth > max_depth:
- print current_depth
- print "reached max depth, stop."
return create_leaf(label)


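[Note] The first stopping condition reads as "every feature's list of candidate split points is empty". Spelled out on a toy value (an illustration, not repository code):

    remaining_features = {0: [], 1: []}  # no candidate splits left anywhere
    exhausted = sum([len(v) != 0 for v in remaining_features.values()]) == 0
    print(exhausted)  # True -> make this node a leaf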
@@ -138,7 +115,6 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =

var_spaces = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])]
cols = [col for col in xrange(data.shape[1])]
- print 'find best split'
pool = Pool()
for split, error, ierr, numf in pool.map(find_splits_parallel, zip(var_spaces, repeat(label), cols)):
if not min_error or error < min_error:
@@ -147,46 +123,19 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
min_split = split
pool.close()

- # for col in xrange(data.shape[1]):
- # var_space = data.iloc[:,col].tolist()
- # split, error, ierr, numf = scipy.optimize.fminbound(error_function, min(var_space), max(var_space), args = (col, data.values.tolist(), label), full_output = 1)
- # if not min_error or error < min_error:
- # min_error = error
- # split_var = col
- # min_split = split
splitting_feature = (split_var, min_split)
- #######
children = split_children(data, label, split_var, min_split)
# print "children:", children
print 'found best split'
# exit()

- # # find splitting features
- # print 'find best_split'
- # splitting_feature, children = find_best_split(data, label, remaining_features) #tuple(id, value)
- # print 'found best split'
- # # print "split feature: ", splitting_feature
- # # remove current features
- # remaining_features[splitting_feature[0]].remove(splitting_feature[1])

left_data, left_label, right_data, right_label = children
if len(left_label) == 0 or len(right_label) == 0:
- print current_depth
- print "One side is empty"
return create_leaf(label)
- # print 'left label: ' + str(left_label)

left_least_square = least_square(left_label)

# Create a leaf node if the split is "perfect"
if left_least_square < ideal_ls:
# print "current left ls: ", least_square(left_label), left_label
print current_depth
print "left stop because less than ls ."
return create_leaf(left_label)
if least_square(right_label) < ideal_ls:
# print "right ls: ", least_square(right_label),right_label
print current_depth
print "right stop becasue less than ls."
return create_leaf(right_label)

# recurse on children
@@ -200,12 +149,6 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
'index' : None}

def error_function(split_point, split_var, data, label):
- # left_index = [index for index in xrange(len(data.iloc[:,split_var])) if data.iloc[index,split_var] < split_point]
- # right_index = [index for index in xrange(len(data.iloc[:,split_var])) if data.iloc[index,split_var] >= split_point]
- # left_data = data.iloc[left_index,:]
- # right_data = data.iloc[right_index,:]
- # left_label = [label[i] for i in left_index]
- # right_label =[label[i] for i in right_index]
data1 = []
data2 = []
for i in xrange(len(data)):
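[Note] find_splits_parallel evidently wraps scipy.optimize.fminbound: the removed serial loop calls it directly, and the (split, error, ierr, numf) unpacking matches fminbound's full_output=1 return of (xopt, fval, ierr, numfunc). A self-contained sketch with a simplified stand-in for error_function:

    import numpy as np
    from scipy.optimize import fminbound

    def sse_at_split(s, values, label):
        # total squared error of the two sides induced by threshold s
        left, right = label[values < s], label[values >= s]
        return sum(np.sum((side - side.mean()) ** 2)
                   for side in (left, right) if len(side))

    values = np.array([0.2, 0.5, 1.1, 1.4])
    label = np.array([1.0, 1.1, 3.0, 3.1])
    split, error, ierr, numf = fminbound(
        sse_at_split, values.min(), values.max(),
        args=(values, label), full_output=1)
    print(split, error)  # a threshold between 0.5 and 1.1, error near 0.01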
@@ -221,7 +164,7 @@ def make_prediction(tree, x, annotate = False):
if tree['is_leaf']:
if annotate:
print "At leaf, predicting %s" % tree['value']
- return tree['value']#tree['index']#, tree['value']
+ return tree['value']
else:
# the splitting value of x.
split_feature_value = x[tree['splitting_feature'][0]]
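[Note] The body above (truncated by the diff) descends left or right by comparing x's value for the splitting feature against the stored split point. A hedged sketch of that recursion; the comparison direction is assumed, since the full branch is collapsed here:

    def predict_node(node, x):
        # walk from the root down to a leaf, picking a child at each split
        if node['is_leaf']:
            return node['value']
        col, split = node['splitting_feature']
        child = node['left'] if x[col] < split else node['right']
        return predict_node(child, x)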
@@ -250,7 +193,6 @@ def fit(self):
for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)):
all_pos_split[col] = dat
pool.close()
- print 'actually making trees'
self.tree = create_tree(self.training_data, all_pos_split, self.labels, self.max_depth, self.ideal_ls)


@@ -268,18 +210,4 @@ def predict(self, test):
model = RegressionTree(data, label)
model.fit()
print model.predict(test)

- # all_pos_split = {}
- # pool = Pool()
- # splitting_data = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])]
- # cols = [col for col in xrange(data.shape[1])]
- # for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)):
- # all_pos_split[col] = dat
- # pool.close()
-
- # # non parallel code
- # # for col in range(data.shape[1]):
- # # all_pos_split[col] = get_splitting_points(data.iloc[:,col].tolist())
-
- # tree = create_tree(data, all_pos_split, label, current_depth = 0)
- # print output(tree, test)

20 changes: 10 additions & 10 deletions lambdamart.py
@@ -159,7 +159,7 @@ def group_queries(training_data, qid_index):
Parameters
----------
training_data : Numpy array of lists
- Contains a list of document information. Each documents format is [relevance score, query index, feature vector]
+ Contains a list of document information. Each document's format is [relevance score, query index, feature vector]
qid_index : int
This is the index where the qid is located in the training data
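[Note] Per this docstring, each row packs [relevance score, query index, feature vector] into one flat array, and group_queries collects the rows belonging to each qid. A hedged sketch of that grouping (the exact return type is assumed, since the function body is collapsed here):

    import numpy as np

    # one row per document: [relevance, qid, feature_1, feature_2]
    training_data = np.array([[2, 1, 0.1, 0.4],
                              [0, 1, 0.3, 0.2],
                              [1, 2, 0.9, 0.5]])

    def group_by_qid(data, qid_index):
        groups = {}
        for i, row in enumerate(data):
            groups.setdefault(row[qid_index], []).append(i)
        return groups

    print(group_by_qid(training_data, 1))  # {1.0: [0, 1], 2.0: [2]}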
@@ -214,9 +214,9 @@ def __init__(self, training_data=None, number_of_trees=5, learning_rate=0.1, tree_type='sklearn'):
Number of trees LambdaMART goes through
learning_rate : float (default: 0.1)
Rate at which we update our prediction with each tree
- tree_type : string (default: sklearn)
- Either sklearn for using Sklearn implementation of the tree or “original” for using
- our implementation of the tree.
+ tree_type : string (default: "sklearn")
+ Either "sklearn" for using Sklearn implementation of the tree or "original"
+ for using our implementation
"""

if tree_type != 'sklearn' and tree_type != 'original':
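[Note] A usage sketch of the constructor documented above, using only the parameters this diff shows; train stands in for a prepared training array:

    model = LambdaMART(training_data=train, number_of_trees=5,
                       learning_rate=0.1, tree_type='sklearn')
    # tree_type='original' selects the repository's own RegressionTree instead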
@@ -276,7 +276,7 @@ def predict(self, data):
Parameters
----------
data : Numpy array of documents
- Numpy array of documents with each documents format is [query index, feature vector]
+ Numpy array of documents with each document's format is [query index, feature vector]
Returns
-------
@@ -299,7 +299,7 @@ def validate(self, data, k):
Parameters
----------
data : Numpy array of documents
- Numpy array of documents with each documents format is [relevance score, query index, feature vector]
+ Numpy array of documents with each document's format is [relevance score, query index, feature vector]
k : int
this is used to compute the NDCG@k
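[Note] validate scores rankings with NDCG@k. For reference, one common formulation of that metric (a sketch: the repository's own gain and discount choices are not shown in this diff):

    import numpy as np

    def dcg_at_k(rels, k):
        rels = np.asarray(rels, dtype=float)[:k]
        return np.sum((2 ** rels - 1) / np.log2(np.arange(2, rels.size + 2)))

    def ndcg_at_k(rels, k):
        ideal = dcg_at_k(sorted(rels, reverse=True), k)
        return dcg_at_k(rels, k) / ideal if ideal > 0 else 0.0

    print(ndcg_at_k([3, 2, 3, 0, 1, 2], 6))  # ~0.95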
@@ -331,25 +331,25 @@ def validate(self, data, k):

def save(self, fname):
"""
- Saves the model into a .lmart file with the name given as a parameter.
+ Saves the model into a ".lmart" file with the name given as a parameter.
Parameters
----------
fname : string
Filename of the file you want to save
"""
- pickle.dump(self, open('%s.lmart' % (fname), "w"), protocol=2)
+ pickle.dump(self, open('%s.lmart' % (fname), "wb"), protocol=2)

def load(self, fname):
"""
- Loads the model from the .lmart file given as a parameter.
+ Loads the model from the ".lmart" file given as a parameter.
Parameters
----------
fname : string
Filename of the file you want to load
"""
- model = pickle.load(open(fname , "r"))
+ model = pickle.load(open(fname , "rb"))
self.training_data = model.training_data
self.number_of_trees = model.number_of_trees
self.tree_type = model.tree_type
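[Note] The two mode changes above ('w' to 'wb', 'r' to 'rb') are almost certainly the Windows fix named in the commit message: pickle writes binary data, and text-mode file handles on Windows translate newline bytes, corrupting the stream. Binary mode behaves identically on every platform:

    import pickle

    model_state = {'number_of_trees': 5, 'learning_rate': 0.1}  # stand-in object
    with open('model.lmart', 'wb') as f:  # 'wb', not 'w': no newline translation
        pickle.dump(model_state, f, protocol=2)
    with open('model.lmart', 'rb') as f:
        restored = pickle.load(f)
    assert restored == model_state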