Commit
cleaned up code and fixed issue on windows computers
aagrawl3 committed Dec 14, 2016
1 parent 775c203 commit 024ce64
Showing 2 changed files with 14 additions and 86 deletions.
80 changes: 4 additions & 76 deletions RegressionTree.py
@@ -47,24 +47,14 @@ def find_best_split(data, label, split_points):
best_ls = 1000000
best_split = None
best_children = None
- print len(split_points.items())
pool = Pool()
for ls, split, children in pool.map(find_best_split_parallel, zip(split_points.items(), repeat(data), repeat(label))):
if ls < best_ls:
best_ls = ls
best_split = split
best_children = children
pool.close()
- # non parallel code
- # for key,possible_split in split_points.items():
- # for split in possible_split:
- # children = split_children(data, label, key, split)

- # #weighted average of left and right ls
- # ls = float(len(children[1]))/len(label)*least_square(children[1]) + float(len(children[3]))/len(label)*least_square(children[3])
- # if ls < best_ls:
- # best_ls = ls
- # best_split = (key, split)




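[Note] The parallel search above fans per-feature work out with multiprocessing.Pool, passing the shared arguments to every task via zip(..., repeat(...)). A minimal sketch of that pattern (names are illustrative, not from the repository); the __main__ guard is what Windows needs, since it spawns rather than forks worker processes:

    from itertools import repeat
    from multiprocessing import Pool

    def worker(args):
        # each task gets one (item, shared) tuple, mirroring the
        # zip(split_points.items(), repeat(data), repeat(label)) call above
        item, shared = args
        return item * shared

    if __name__ == '__main__':  # required on Windows (spawn, not fork)
        pool = Pool()
        try:
            results = pool.map(worker, zip([1, 2, 3], repeat(10)))
        finally:
            pool.close()
            pool.join()
        print(results)  # [10, 20, 30]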
@@ -81,13 +71,6 @@ def split_children(data, label, key, split):
return left_data, left_label, right_data, right_label

def least_square(label):
- # # given the
- # # return the
- # if not len(label):
- # return 0
- # # label = np.array(label).astype(float)
- # return len(label) * np.var(label)
- # # return np.sum((label - np.mean(label))**2)
if not len(label):
return 0
return (np.sum(label)**2)/len(set(label))
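[Note] As the removed comments record, the earlier implementation len(label) * np.var(label) equals the sum of squared deviations from the mean, np.sum((label - np.mean(label))**2). A quick check of that identity (np.var defaults to the population variance, ddof=0):

    import numpy as np

    label = np.array([1.0, 2.0, 4.0])
    ssd_direct = np.sum((label - np.mean(label)) ** 2)
    ssd_via_var = len(label) * np.var(label)
    assert np.isclose(ssd_direct, ssd_via_var)

Note that the expression kept in the code, (np.sum(label)**2)/len(set(label)), is a different quantity from either form above, so the swap changes behavior rather than just tidying.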
@@ -102,8 +85,6 @@ def create_leaf(label):
'is_leaf':True,
'index':node_id}
leaf['value'] = round(np.mean(label),3)
- # print 'val: ' + str(leaf['value'])
- # print label
return leaf

def find_splits_parallel(args):
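[Note] For reference, the node dictionaries that create_leaf (and, further down, create_tree) build have this shape; the concrete values are made up for illustration:

    leaf = {'splitting_feature': None, 'left': None, 'right': None,
            'is_leaf': True,
            'index': 7,        # node_id assigned at creation
            'value': 2.415}    # round(np.mean(label), 3)

    node = {'splitting_feature': (3, 0.52),  # (column index, split value)
            'left': leaf, 'right': leaf,
            'is_leaf': False, 'value': None, 'index': None}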
@@ -120,14 +101,10 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
remaining_features = all_pos_split
#stopping conditions
if sum([len(v)!= 0 for v in remaining_features.values()]) == 0:
- print current_depth
- print "stop because no more features."
# If there are no remaining features to consider, make current node a leaf node
return create_leaf(label)
# #Additional stopping condition (limit tree depth)
elif current_depth > max_depth:
- print current_depth
- print "reached max depth, stop."
return create_leaf(label)


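[Note] The first stopping condition reads as "every feature's list of candidate split points is empty". Spelled out on a toy value (an illustration, not repository code):

    remaining_features = {0: [], 1: []}  # no candidate splits left anywhere
    exhausted = sum([len(v) != 0 for v in remaining_features.values()]) == 0
    print(exhausted)  # True -> make this node a leaf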
@@ -138,7 +115,6 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =

var_spaces = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])]
cols = [col for col in xrange(data.shape[1])]
- print 'find best split'
pool = Pool()
for split, error, ierr, numf in pool.map(find_splits_parallel, zip(var_spaces, repeat(label), cols)):
if not min_error or error < min_error:
@@ -147,46 +123,19 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
min_split = split
pool.close()

- # for col in xrange(data.shape[1]):
- # var_space = data.iloc[:,col].tolist()
- # split, error, ierr, numf = scipy.optimize.fminbound(error_function, min(var_space), max(var_space), args = (col, data.values.tolist(), label), full_output = 1)
- # if not min_error or error < min_error:
- # min_error = error
- # split_var = col
- # min_split = split
splitting_feature = (split_var, min_split)
- #######
children = split_children(data, label, split_var, min_split)
# print "children:", children
print 'found best split'
# exit()

- # # find splitting features
- # print 'find best_split'
- # splitting_feature, children = find_best_split(data, label, remaining_features) #tuple(id, value)
- # print 'found best split'
- # # print "split feature: ", splitting_feature
- # # remove current features
- # remaining_features[splitting_feature[0]].remove(splitting_feature[1])

left_data, left_label, right_data, right_label = children
if len(left_label) == 0 or len(right_label) == 0:
- print current_depth
- print "One side is empty"
return create_leaf(label)
- # print 'left label: ' + str(left_label)

left_least_square = least_square(left_label)

# Create a leaf node if the split is "perfect"
if left_least_square < ideal_ls:
# print "current left ls: ", least_square(left_label), left_label
print current_depth
print "left stop because less than ls ."
return create_leaf(left_label)
if least_square(right_label) < ideal_ls:
# print "right ls: ", least_square(right_label),right_label
print current_depth
print "right stop becasue less than ls."
return create_leaf(right_label)

# recurse on children
@@ -200,12 +149,6 @@ def create_tree(data, all_pos_split, label, max_depth, ideal_ls, current_depth =
'index' : None}

def error_function(split_point, split_var, data, label):
- # left_index = [index for index in xrange(len(data.iloc[:,split_var])) if data.iloc[index,split_var] < split_point]
- # right_index = [index for index in xrange(len(data.iloc[:,split_var])) if data.iloc[index,split_var] >= split_point]
- # left_data = data.iloc[left_index,:]
- # right_data = data.iloc[right_index,:]
- # left_label = [label[i] for i in left_index]
- # right_label =[label[i] for i in right_index]
data1 = []
data2 = []
for i in xrange(len(data)):
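[Note] find_splits_parallel evidently wraps scipy.optimize.fminbound: the removed serial loop calls it directly, and the (split, error, ierr, numf) unpacking matches fminbound's full_output=1 return of (xopt, fval, ierr, numfunc). A self-contained sketch with a simplified stand-in for error_function:

    import numpy as np
    from scipy.optimize import fminbound

    def sse_at_split(s, values, label):
        # total squared error of the two sides induced by threshold s
        left, right = label[values < s], label[values >= s]
        return sum(np.sum((side - side.mean()) ** 2)
                   for side in (left, right) if len(side))

    values = np.array([0.2, 0.5, 1.1, 1.4])
    label = np.array([1.0, 1.1, 3.0, 3.1])
    split, error, ierr, numf = fminbound(
        sse_at_split, values.min(), values.max(),
        args=(values, label), full_output=1)
    print(split, error)  # a threshold between 0.5 and 1.1, error near 0.01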
@@ -221,7 +164,7 @@ def make_prediction(tree, x, annotate = False):
if tree['is_leaf']:
if annotate:
print "At leaf, predicting %s" % tree['value']
- return tree['value']#tree['index']#, tree['value']
+ return tree['value']
else:
# the splitting value of x.
split_feature_value = x[tree['splitting_feature'][0]]
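[Note] The body above (truncated by the diff) descends left or right by comparing x's value for the splitting feature against the stored split point. A hedged sketch of that recursion; the comparison direction is assumed, since the full branch is collapsed here:

    def predict_node(node, x):
        # walk from the root down to a leaf, picking a child at each split
        if node['is_leaf']:
            return node['value']
        col, split = node['splitting_feature']
        child = node['left'] if x[col] < split else node['right']
        return predict_node(child, x)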
@@ -250,7 +193,6 @@ def fit(self):
for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)):
all_pos_split[col] = dat
pool.close()
- print 'actually making trees'
self.tree = create_tree(self.training_data, all_pos_split, self.labels, self.max_depth, self.ideal_ls)


@@ -268,18 +210,4 @@ def predict(self, test):
model = RegressionTree(data, label)
model.fit()
print model.predict(test)

- # all_pos_split = {}
- # pool = Pool()
- # splitting_data = [data.iloc[:,col].tolist() for col in xrange(data.shape[1])]
- # cols = [col for col in xrange(data.shape[1])]
- # for dat, col in pool.map(get_splitting_points, zip(splitting_data, cols)):
- # all_pos_split[col] = dat
- # pool.close()
-
- # # non parallel code
- # # for col in range(data.shape[1]):
- # # all_pos_split[col] = get_splitting_points(data.iloc[:,col].tolist())
-
- # tree = create_tree(data, all_pos_split, label, current_depth = 0)
- # print output(tree, test)

20 changes: 10 additions & 10 deletions lambdamart.py
@@ -159,7 +159,7 @@ def group_queries(training_data, qid_index):
Parameters
----------
training_data : Numpy array of lists
- Contains a list of document information. Each documents format is [relevance score, query index, feature vector]
+ Contains a list of document information. Each document's format is [relevance score, query index, feature vector]
qid_index : int
This is the index where the qid is located in the training data
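[Note] Per this docstring, each row packs [relevance score, query index, feature vector] into one flat array, and group_queries collects the rows belonging to each qid. A hedged sketch of that grouping (the exact return type is assumed, since the function body is collapsed here):

    import numpy as np

    # one row per document: [relevance, qid, feature_1, feature_2]
    training_data = np.array([[2, 1, 0.1, 0.4],
                              [0, 1, 0.3, 0.2],
                              [1, 2, 0.9, 0.5]])

    def group_by_qid(data, qid_index):
        groups = {}
        for i, row in enumerate(data):
            groups.setdefault(row[qid_index], []).append(i)
        return groups

    print(group_by_qid(training_data, 1))  # {1.0: [0, 1], 2.0: [2]}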
@@ -214,9 +214,9 @@ def __init__(self, training_data=None, number_of_trees=5, learning_rate=0.1, tree_type='sklearn'):
Number of trees LambdaMART goes through
learning_rate : float (default: 0.1)
Rate at which we update our prediction with each tree
- tree_type : string (default: sklearn)
- Either sklearn for using Sklearn implementation of the tree or “original” for using
- our implementation of the tree.
+ tree_type : string (default: "sklearn")
+ Either "sklearn" for using Sklearn implementation of the tree or "original"
+ for using our implementation
"""

if tree_type != 'sklearn' and tree_type != 'original':
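[Note] A usage sketch of the constructor documented above, using only the parameters this diff shows; train stands in for a prepared training array:

    model = LambdaMART(training_data=train, number_of_trees=5,
                       learning_rate=0.1, tree_type='sklearn')
    # tree_type='original' selects the repository's own RegressionTree instead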
@@ -276,7 +276,7 @@ def predict(self, data):
Parameters
----------
data : Numpy array of documents
- Numpy array of documents with each documents format is [query index, feature vector]
+ Numpy array of documents with each document's format is [query index, feature vector]
Returns
-------
@@ -299,7 +299,7 @@ def validate(self, data, k):
Parameters
----------
data : Numpy array of documents
- Numpy array of documents with each documents format is [relevance score, query index, feature vector]
+ Numpy array of documents with each document's format is [relevance score, query index, feature vector]
k : int
this is used to compute the NDCG@k
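[Note] validate scores rankings with NDCG@k. For reference, one common formulation of that metric (a sketch: the repository's own gain and discount choices are not shown in this diff):

    import numpy as np

    def dcg_at_k(rels, k):
        rels = np.asarray(rels, dtype=float)[:k]
        return np.sum((2 ** rels - 1) / np.log2(np.arange(2, rels.size + 2)))

    def ndcg_at_k(rels, k):
        ideal = dcg_at_k(sorted(rels, reverse=True), k)
        return dcg_at_k(rels, k) / ideal if ideal > 0 else 0.0

    print(ndcg_at_k([3, 2, 3, 0, 1, 2], 6))  # ~0.95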
@@ -331,25 +331,25 @@ def validate(self, data, k):

def save(self, fname):
"""
- Saves the model into a .lmart file with the name given as a parameter.
+ Saves the model into a ".lmart" file with the name given as a parameter.
Parameters
----------
fname : string
Filename of the file you want to save
"""
- pickle.dump(self, open('%s.lmart' % (fname), "w"), protocol=2)
+ pickle.dump(self, open('%s.lmart' % (fname), "wb"), protocol=2)

def load(self, fname):
"""
- Loads the model from the .lmart file given as a parameter.
+ Loads the model from the ".lmart" file given as a parameter.
Parameters
----------
fname : string
Filename of the file you want to load
"""
- model = pickle.load(open(fname , "r"))
+ model = pickle.load(open(fname , "rb"))
self.training_data = model.training_data
self.number_of_trees = model.number_of_trees
self.tree_type = model.tree_type
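[Note] The two mode changes above ('w' to 'wb', 'r' to 'rb') are almost certainly the Windows fix named in the commit message: pickle writes binary data, and text-mode file handles on Windows translate newline bytes, corrupting the stream. Binary mode behaves identically on every platform:

    import pickle

    model_state = {'number_of_trees': 5, 'learning_rate': 0.1}  # stand-in object
    with open('model.lmart', 'wb') as f:  # 'wb', not 'w': no newline translation
        pickle.dump(model_state, f, protocol=2)
    with open('model.lmart', 'rb') as f:
        restored = pickle.load(f)
    assert restored == model_state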