-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscoring_function.py
109 lines (94 loc) · 5.06 KB
/
scoring_function.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
__author__ = 'Jus'
# toolkits
# import matplotlib.pyplot as plt
import json
# import math as mt
# import numpy as np
# def scoring_function(list1_author, list1_list2_sentiment, list1_list2_relevance, stock_tuples) - deleted
# base_object = [ ['Company_name1-string', 'sentiment_score1-float', 'relevance_score1-float', 'stock_score1-float'],
# ['Company_name2-string', 'sentiment_score2-float', 'relevance_score2-float', 'stock_score2-float'],
# ...
# ]
# My output would be a list of strings (you'll see why strings, and not floats), where each string is a score. Therefore
# each element in the list would be a score to the company it corresponds to.
# So output -> return_value = ['score_company_1-string', 'score_company_2-string', ......]
# sentiment = [-1,1]
# relevance = [0,1]
# After the normalized stock score is computed, given to me as an input, I will use whether it is
# a net positive, or net negative to see whether it agrees with the sentiment.
# In general, the more positive the score, the higher we should rank the document. As a lot of
# the ranking depends on the interval we choose to return the results in, it would be best to
# run a trial version of the scoring function and compare how each data set was scored. That would
# allow me to make modifications to the scoring function for any bugs, as required.
# In order to visualize the data and see the edge cases of our scoring function, I also decided to plot two graphs.
# The first graph is mainly a curve of the score of the companies, as calculated by the scoring function (1)
# The second graph is a plot of the stock scores over the number of companies. I will refine this graph by taking
# the stock values used to compute the score and, if I can get some previous data, extrapolating the future stock
# value using a Fourier analysis combined with a sort of HMM (for which Connor already wrote some code.) =)
# Mainly, these graphs are for us, but maybe we could extend them to use ideas to plot predicted stock behavior (in
# a very elementary way), or show the accuracy of an author, and plot his accuracy vs. time, assuming he has more
# than one article. Or, if that proposal is not good, we could just plot the accuracy as a score, for each company.
# Juspreet Sandhu
def scoring_function(base_object, alpha=0.4):
    """Score every company row and return the scores as a compact JSON array.

    Each row of ``base_object`` is read by position as
    ``[name, sentiment, relevance, stock]``.  Entries 1-3 are converted with
    ``float()``, so numeric strings are accepted; entry 0 is never read.

    A row with a non-zero stock score is scored from the sentiment/stock
    ratio blended with relevance; a row with a zero stock score is scored
    from sentiment and relevance alone.

    Args:
        base_object: Iterable of rows, one per company.
        alpha: Weight given to the sentiment/stock ratio; relevance is
            weighted by ``1 - alpha``.  Defaults to 0.4.

    Returns:
        A JSON string encoding a list with one string per input row, in
        input order.  Each entry is ``str()`` of the numeric score, or the
        sentinel ``"-12.5 , Available data does not fit parameters"`` when
        stock is non-zero but sentiment and stock are neither same-signed
        nor opposite-signed (e.g. sentiment is exactly 0).

    Raises:
        IndexError: If a row has fewer than four entries.
        ValueError, TypeError: If entries 1-3 cannot be converted by
            ``float()``.
        ZeroDivisionError: With a zero stock score, when sentiment is -1
            and relevance > .5, or relevance is -1 and sentiment > .5.
    """
    alpha = float(alpha)
    scores = []
    for row in base_object:
        sentiment = float(row[1])
        relevance = float(row[2])
        # Convert before testing for zero so that '0' / '0.0' strings are
        # recognised as "no stock data" as well.
        stock = float(row[3])
        if stock != 0:
            scores.append(_score_with_stock(sentiment, relevance, stock, alpha))
        else:
            scores.append(_score_without_stock(sentiment, relevance))
    # ``separators`` must be an (item_separator, key_separator) pair; a bare
    # ',' string cannot be unpacked and makes json.dumps raise ValueError.
    return json.dumps(scores, separators=(',', ':'))


def _score_with_stock(sentiment, relevance, stock, alpha):
    """Return the score string for a row whose stock score is non-zero."""
    same_sign = (stock > 0 and sentiment > 0) or (stock < 0 and sentiment < 0)
    opposite_sign = (stock > 0 > sentiment) or (stock < 0 < sentiment)
    if not (same_sign or opposite_sign):
        # Reached when sentiment is exactly 0 (or a value is NaN).
        return "-12.5 , Available data does not fit parameters"
    divisor = stock
    if opposite_sign and -.5 < stock < .5:
        # Sentiment disagrees with a small stock move: halve the ratio so
        # the mismatch is punished less.
        divisor = 2*stock
    return str(alpha*(sentiment/divisor) + (1 - alpha)*relevance)


def _score_without_stock(sentiment, relevance):
    """Return the score string for a row whose stock score is zero."""
    # The damped ratios are computed only in the branch that uses them, so a
    # zero denominator cannot fail a row whose score never needs that ratio.
    # NOTE(review): sentiment == -1 with relevance > .5 still divides by
    # zero; choosing a fallback score for that case is left to the owner.
    if sentiment > .5 and relevance > .5:
        beta = sentiment/(1 + sentiment)
        return str(beta + relevance/2)
    if sentiment > .5 and relevance < .5:
        gamma = relevance/(1 + relevance)
        return str(gamma + relevance/1.5)
    if sentiment < .5 and relevance > .5:
        beta = sentiment/(1 + sentiment)
        return str(beta*relevance)
    # Either value is exactly .5, or both are below .5.
    return str(sentiment*relevance)
# x = []
# y = []
# def plotting_function1(list_scores):
# for i in range(len(list_scores)):
# x.append(i)
# y.append(list_scores[i])
# plt.xlabel(' Number of Companies')
# plt.ylabel(' Scores')
# plt.plot(x, y)
# plt.show()
# def plotting_function2(list_stock_scores):
# for i in range(len(list_stock_scores)):
# x.append(i)
# y.append(list_stock_scores[i])
# plt.xlabel(' Number of Companies')
# plt.ylabel(' Stock Behavior Scores')
# plt.plot(x, y)
# plt.show()