-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathTIdatabase.py
247 lines (210 loc) · 8.55 KB
/
TIdatabase.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import string
import random as random
import pandas as pd
import numpy as np
import os
# Shared module-level tables. Both start out unset (None); the Student and
# College classes below rebind them through ``global`` statements.
studentDF, collegeDF = None, None
class Student:
    """Accessor for the module-level student table ``studentDF``.

    Every instance reads and writes the same module-level DataFrame, so the
    table is shared by all ``Student`` objects in the process. Rows are
    identified by a randomly generated ``studentID`` string.

    Attributes:
        keysize: Number of characters in a generated studentID.
        factorcolumns: Names of the indicator columns.
        columnlist: Column names of a freshly created table, in order.
    """

    def __init__(self):
        """Define the column schema and create the shared table if it is unset."""
        self.keysize = 10
        self.factorcolumns = ['canAfford', 'female', 'MinorityGender', 'MinorityRace',
                              'international', 'firstinfamily', 'sports', 'artist',
                              'workexp', 'schooltype']
        self.columnlist = ['studentID', 'classrank', 'admissionstest', 'AP', 'averageAP',
                           'SATsubject', 'GPA', 'GPA_w', 'program',
                           'intendedgradyear', 'addInfo'] + self.factorcolumns
        self._table()

    def _table(self):
        """Return the shared table, creating an empty one when it is unset.

        This keeps insert/fillRandom/save usable after ``cleanup()``.
        """
        global studentDF
        if studentDF is None:
            studentDF = pd.DataFrame(columns=self.columnlist)
        return studentDF

    def newstudentID(self):
        """Return a random ID of ``keysize`` uppercase letters and digits."""
        alphabet = string.ascii_uppercase + string.digits
        return ''.join(random.choice(alphabet) for _ in range(self.keysize))

    def _unused_student_id(self, reserved=()):
        """Generate an ID that is neither stored in the table nor in ``reserved``."""
        frame = self._table()
        taken = set(reserved)
        if 'studentID' in frame.columns:
            taken.update(frame['studentID'].values)
        candidate = self.newstudentID()
        # Collisions are rare; keep drawing until the value is unique.
        while candidate in taken:
            candidate = self.newstudentID()
        return candidate

    @property
    def df(self):
        """The shared student DataFrame (``None`` right after ``cleanup()``)."""
        global studentDF
        return studentDF

    @df.setter
    def df(self, df):
        """Replace the shared table.

        Raises:
            TypeError: If ``df`` is not a pandas DataFrame.
        """
        global studentDF
        if not isinstance(df, pd.DataFrame):
            raise TypeError('Expected a Pandas DataFrame')
        studentDF = df

    def cleanup(self):
        """Reset the shared table to ``None``."""
        global studentDF
        studentDF = None

    def insertrow(self, row, reserved=()):
        """Assign a fresh ``studentID`` to ``row`` and return it.

        The dict is modified in place (its ``'studentID'`` key is set) and is
        NOT added to the table; ``insert`` does that.

        Args:
            row: Dict mapping column names to values.
            reserved: Optional collection of IDs that must not be reused, in
                addition to those already stored in the table.

        Returns:
            The same dict, now carrying ``'studentID'``.

        Raises:
            TypeError: If ``row`` is not a dict.
        """
        if not isinstance(row, dict):
            raise TypeError("only dicts can be used to insert")
        row['studentID'] = self._unused_student_id(reserved)
        # Missing indicator columns are deliberately left absent so that they
        # end up as NaN in the table rather than being filled with 0.
        return row

    def insert(self, args):
        """Insert a single row (a dict) or a list of rows (dicts).

        Args:
            args: A dict, or a list of dicts, mapping column names to values.

        Returns:
            List of the studentIDs assigned to the inserted rows, in order.

        Raises:
            TypeError: If ``args`` is neither a dict nor a list, or if a list
                element is not a dict.
        """
        global studentDF
        if isinstance(args, dict):
            entries = [args]
        elif isinstance(args, list):
            entries = args
        else:
            raise TypeError("insert either a single dict or a list of dicts")

        rows = []
        newstudentIDs = []
        issued = set()
        for entry in entries:
            # The table is only updated after the loop, so IDs issued earlier
            # in this batch have to be excluded explicitly.
            newrow = self.insertrow(entry, issued)
            issued.add(newrow['studentID'])
            rows.append(newrow)
            newstudentIDs.append(newrow['studentID'])

        if rows:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # replacement. Index labels of the new rows are not renumbered,
            # as with the former append call.
            studentDF = pd.concat([self._table(), pd.DataFrame(rows)], sort=False)
        return newstudentIDs

    def read(self, filename):
        """Replace the shared table with the CSV at ``filename``.

        The first CSV column is used as the index.
        """
        global studentDF
        studentDF = pd.read_csv(filename, index_col=0)

    def save(self, filename):
        """Write the shared table to ``filename`` as CSV; returns ``to_csv``'s result."""
        return self._table().to_csv(filename)

    def fillRandom(self, nrows, nanpct=0.0):
        """Write ``nrows`` random rows at index labels ``0 .. nrows-1``.

        Rows that already exist under those labels are overwritten. The table
        must have exactly as many columns as ``columnlist``, because values
        are assigned positionally.

        Args:
            nrows: Number of rows to generate.
            nanpct: When greater than 0, ``round(len(columnlist) * nanpct) + 1``
                columns (capped at the number of columns) are sampled per row
                and set to NaN; ``studentID`` is never blanked.
        """
        frame = self._table()
        positions = {name: pos for pos, name in enumerate(frame.columns)}
        for i in range(nrows):
            values = [
                self.newstudentID(),  # studentID
                random.random(),  # classrank
                random.random(),  # admissionstest
                random.random(),  # AP
                random.random(),  # averageAP
                random.random(),  # SATsubject
                random.random(),  # GPA
                random.random(),  # GPA_w
                ''.join(random.choice(string.ascii_uppercase) for _ in range(12)),  # program
                random.randint(2009, 2020),  # intendedgradyear
                ''.join(random.choice(string.ascii_uppercase) for _ in range(12)),  # addInfo
                random.randint(0, 1),  # canAfford
                random.randint(0, 1),  # female
                random.randint(0, 1),  # MinorityGender
                random.randint(0, 1),  # MinorityRace
                random.randint(0, 1),  # international
                random.randint(0, 1),  # firstinfamily
                random.randint(0, 1),  # sports
                random.randint(0, 1),  # artist
                random.randint(0, 1),  # workexp
                random.randint(0, 1),  # schooltype
            ]
            unknown = []
            if nanpct > 0.0:
                ncols = min(int(round(len(self.columnlist) * nanpct)) + 1,
                            len(self.columnlist))
                for c in random.sample(self.columnlist, ncols):
                    if c == 'studentID':
                        continue
                    if c in positions:
                        # Blank the value before the row is written, so no NaN
                        # is assigned into an already-typed integer column.
                        values[positions[c]] = np.nan
                    else:
                        unknown.append(c)
            frame.loc[i] = values
            for c in unknown:
                # Column missing from the table: set by label, which adds it.
                frame.loc[i, c] = np.nan
class College:
    """Accessor for the module-level college table ``collegeDF``.

    Every instance reads and writes the same module-level DataFrame. The
    table is loaded from ``collegelist.csv`` in the directory of this module.
    NOTE(review): code elsewhere in this file reads a ``collegeID`` column
    from the table; the full CSV schema is not visible here.
    """

    def __init__(self):
        """Load ``collegelist.csv`` into the shared table if it is not set yet.

        The file is read only while the shared table is ``None``, so creating
        another ``College`` does not discard a frame assigned through ``df``
        (the same lazy pattern ``Student`` uses for its table).
        """
        global collegeDF
        if collegeDF is None:
            csv_path = os.path.join(os.path.dirname(__file__), "collegelist.csv")
            collegeDF = pd.read_csv(csv_path)

    @property
    def df(self):
        """The shared college DataFrame."""
        global collegeDF
        return collegeDF

    @df.setter
    def df(self, df):
        """Replace the shared table.

        Raises:
            TypeError: If ``df`` is not a pandas DataFrame.
        """
        global collegeDF
        if not isinstance(df, pd.DataFrame):
            raise TypeError('Expected a Pandas DataFrame')
        collegeDF = df
class ApplForm:
    """Table of applications: one row per (studentID, collegeID) pair.

    Unlike the student and college tables, the DataFrame is stored on the
    instance (``ApplFormDF``), so it is not shared between instances.
    ``studentID`` and ``collegeID`` are checked against the module-level
    ``studentDF`` and ``collegeDF`` tables.
    """

    # Consecutive rejected samples in fillRandom before it verifies that an
    # unused (studentID, collegeID) pair still exists.
    _MISSES_BEFORE_CHECK = 100

    def __init__(self):
        """Create an empty application table."""
        self.ApplFormDF = pd.DataFrame(columns=[
            'studentID', 'collegeID', 'earlyAppl', 'visited',
            'alumni', 'outofstate', 'acceptStatus', 'acceptProb'])

    @property
    def df(self):
        """The application DataFrame of this instance."""
        return self.ApplFormDF

    @df.setter
    def df(self, df):
        """Replace the application table.

        Raises:
            TypeError: If ``df`` is not a pandas DataFrame.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError('Expected a Pandas DataFrame')
        self.ApplFormDF = df

    def _pair_exists(self, studentID, collegeID):
        """Return True when the table already holds this student/college pair."""
        frame = self.ApplFormDF
        # Test the row mask itself; counting a value column would overlook
        # rows whose value in that column is NaN.
        matches = (frame['studentID'] == studentID) & (frame['collegeID'] == collegeID)
        return bool(matches.any())

    def _has_free_pair(self):
        """Return True when at least one known student/college pair is unused."""
        students = set(studentDF.studentID.values)
        colleges = set(collegeDF.collegeID.values)
        used = {
            pair
            for pair in zip(self.ApplFormDF['studentID'], self.ApplFormDF['collegeID'])
            if pair[0] in students and pair[1] in colleges
        }
        return len(used) < len(students) * len(colleges)

    def insertrow(self, row):
        """Validate one application row and return it unchanged.

        The row is NOT added to the table; ``insert`` does that.

        Args:
            row: Dict with at least ``'studentID'`` and ``'collegeID'`` keys.

        Returns:
            The same dict.

        Raises:
            TypeError: If ``row`` is not a dict.
            KeyError: If a key is missing, the studentID or collegeID is
                unknown, or the pair is already in the table.
        """
        if not isinstance(row, dict):
            raise TypeError("only dicts can be used to insert")
        # Check that the foreign keys exist in studentDF and collegeDF.
        if row['studentID'] not in studentDF.studentID.values:
            raise KeyError("The studentID does not exist in StudentDF")
        if row['collegeID'] not in collegeDF.collegeID.values:
            raise KeyError("The collegeID does not exist in CollegeDF")
        if self._pair_exists(row['studentID'], row['collegeID']):
            raise KeyError("This combination of student and college already exists")
        return row

    def insert(self, args):
        """Insert a single row (a dict) or a list of rows (dicts).

        Nothing is added when any row fails validation.

        Raises:
            TypeError: If ``args`` is neither a dict nor a list, or if a list
                element is not a dict.
            KeyError: See ``insertrow``; also raised when the same pair
                occurs twice within ``args``.
        """
        if isinstance(args, dict):
            entries = [args]
        elif isinstance(args, list):
            entries = args
        else:
            raise TypeError("insert either a single dict or a list of dicts")

        rows = []
        batch_pairs = set()
        for entry in entries:
            row = self.insertrow(entry)
            # insertrow only sees rows already stored, so duplicates inside
            # this batch are caught here.
            pair = (row['studentID'], row['collegeID'])
            if pair in batch_pairs:
                raise KeyError("This combination of student and college already exists")
            batch_pairs.add(pair)
            rows.append(row)

        if rows:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # replacement. Index labels of the new rows are not renumbered,
            # as with the former append call.
            self.ApplFormDF = pd.concat([self.ApplFormDF, pd.DataFrame(rows)], sort=False)

    def read(self, filename):
        """Replace the table with the CSV at ``filename`` (first column is the index)."""
        self.ApplFormDF = pd.read_csv(filename, index_col=0)

    def save(self, filename):
        """Write the table to ``filename`` as CSV; returns ``to_csv``'s result."""
        return self.ApplFormDF.to_csv(filename)

    def fillRandom(self, nrows):
        """Write ``nrows`` random applications at index labels ``0 .. nrows-1``.

        Each row pairs a randomly sampled student with a randomly sampled
        college that is not yet in the table. Rows already stored under
        those labels are overwritten.

        Raises:
            ValueError: If there are not enough unused student/college pairs
                to produce ``nrows`` distinct rows.
        """
        possible = studentDF.studentID.nunique() * collegeDF.collegeID.nunique()
        if nrows > possible:
            raise ValueError("not enough student/college combinations for %d rows" % nrows)

        i = 0
        misses = 0
        while i < nrows:
            studentID = studentDF.sample(1).studentID.iloc[0]
            collegeID = collegeDF.sample(1).collegeID.iloc[0]
            if self._pair_exists(studentID, collegeID):
                misses += 1
                # After many rejections in a row, confirm a free pair is
                # left instead of sampling forever.
                if misses >= self._MISSES_BEFORE_CHECK:
                    if not self._has_free_pair():
                        raise ValueError("no unused student/college combination is left")
                    misses = 0
                continue
            misses = 0
            self.ApplFormDF.loc[i] = [
                studentID, collegeID,
                random.randint(0, 1),  # earlyAppl
                random.randint(0, 1),  # visited
                random.randint(0, 1),  # alumni
                random.randint(0, 1),  # outofstate
                random.randint(0, 1),  # acceptStatus
                random.random(),  # acceptProb
            ]
            i += 1
class Application:
    """
    Pairs a single student with a single ApplForm. Currently an empty
    placeholder: construction stores nothing.
    """

    def __init__(self):
        # Nothing to initialise yet.
        pass