-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhotEncoding.py
143 lines (113 loc) · 6.44 KB
/
hotEncoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# ----------------------------------------------------------------------
# Creates the hot encoded version of the "adult" data set.
# The list of features is known and fixed. Some of them are mapped
# directly to an integer, but others are hot encoded because there is
# no order between the possible values of that feature.
# The class HotEncoding takes the list of possible values and creates a
# mapping from each value to a list of the strings "1" and "0". In fact,
# at this stage the tensor still contains strings.
#
# In the "generateHotEncoding" function, all the encodings for the
# attributes are created first and are then used to create a new tensor.
#
# Andrea Cucchietti, 2022
# ----------------------------------------------------------------------
import numpy as np
import copy
# Names of the columns of one row of the original data set, as byte strings.
# The order matters: fillRowNewTensor walks this list while advancing its
# read position in the old row by one per name, so entry k names column k.
LIST_NAMES_FEATURE = [b"age", b"workclass", b"fnlwgt", b"education", b"education-num", b"marital-status", b"occupation",
b"relationship", b"race", b"sex", b"capital-gain", b"capital-loss", b"hours-per-week", b"native-country", b"income"]
# Features that get hot encoded, each mapped to the list of values it can take.
# The index of a value inside its list is the index of the "1" in the encoded
# vector built by HotEncoding, so reordering a list changes the output layout.
# Every list ends with b"?" (presumably the marker for a missing value --
# verify against the data loader).
# NOTE(review): the values are looked up verbatim as dictionary keys, so the
# spellings (e.g. b"Columbia", b"Trinadad&Tobago", b"Holand-Netherlands")
# presumably mirror the raw data file -- confirm before "correcting" them.
POSSIBLEVALUESFORFEATURE = {
b"sex": [b"Female", b"Male", b"?"],
b"workclass": [b"Private", b"Self-emp-not-inc", b"Self-emp-inc",
b"Federal-gov", b"Local-gov", b"State-gov", b"Without-pay", b"Never-worked", b"?"],
b"marital-status": [b"Married-civ-spouse", b"Divorced", b"Never-married",
b"Separated", b"Widowed", b"Married-spouse-absent", b"Married-AF-spouse", b"?"],
b"occupation": [b"Tech-support", b"Craft-repair", b"Other-service", b"Sales", b"Exec-managerial", b"Prof-specialty", b"Handlers-cleaners",
b"Machine-op-inspct", b"Adm-clerical", b"Farming-fishing", b"Transport-moving", b"Priv-house-serv", b"Protective-serv", b"Armed-Forces", b"?"],
b"relationship": [b"Wife", b"Own-child", b"Husband",
b"Not-in-family", b"Other-relative", b"Unmarried", b"?"],
b"race": [b"White", b"Asian-Pac-Islander",
b"Amer-Indian-Eskimo", b"Other", b"Black", b"?"],
b"native-country": [b"United-States", b"Cambodia", b"England", b"Puerto-Rico", b"Canada", b"Germany", b"Outlying-US(Guam-USVI-etc)", b"India", b"Japan", b"Greece", b"South", b"China", b"Cuba", b"Iran", b"Honduras", b"Philippines", b"Italy", b"Poland", b"Jamaica", b"Vietnam", b"Mexico",
b"Portugal", b"Ireland", b"France", b"Dominican-Republic", b"Laos", b"Ecuador", b"Taiwan", b"Haiti", b"Columbia", b"Hungary", b"Guatemala", b"Nicaragua", b"Scotland", b"Thailand", b"Yugoslavia", b"El-Salvador", b"Trinadad&Tobago", b"Peru", b"Hong", b"Holand-Netherlands", b"?"]
}
class HotEncoding(object):
    """Hot encoder for one categorical feature.

    Each possible value is associated with a NumPy array of dtype "S1" whose
    length is the number of possible values; the array holds b"1" at the
    index the value has in the list given to the constructor and b"0"
    everywhere else.
    """

    def __init__(self, listPossibleValue) -> None:
        self._listPossibleValues = listPossibleValue
        self._numPossibleValues = len(listPossibleValue)
        self._map = {}
        self.__generateMapping()

    def __generateMapping(self):
        """Populate ``self._map`` with one encoded vector per possible value."""
        # Row i of the identity matrix is the hot vector of the i-th value.
        identity = np.eye(self._numPossibleValues)
        for word, hotRow in zip(self._listPossibleValues, identity):
            # The float entries 0.0 / 1.0 become the one-byte strings
            # b"0" / b"1" when cast to "S1".
            self._map[word] = hotRow.astype(dtype="S1")

    def getNumPossibleValues(self):
        """Return how many distinct values this feature can take."""
        return self._numPossibleValues

    def getEncodedFromValue(self, value):
        """Return the encoded vector of ``value``.

        Raises KeyError when ``value`` is not one of the possible values.
        """
        return self._map[value]

    def HotEncodeOldRow(self, oldRow, newRow, startNewPosFeature, oldPosFeature):
        """Write the encoding of one feature of ``oldRow`` into ``newRow``.

        The value read from ``oldRow[oldPosFeature]`` is encoded and stored in
        ``newRow`` starting at ``startNewPosFeature``. The returned index is
        the first position of ``newRow`` after the slots just written.
        """
        encodedValue = self.getEncodedFromValue(oldRow[oldPosFeature])
        endPosFeature = startNewPosFeature + self._numPossibleValues
        newRow[startNewPosFeature:endPosFeature] = encodedValue
        return endPosFeature
def getListFeatureAfterHotEnc():
    """Return the column names of the tensor after hot encoding.

    Features that are not hot encoded keep their own name. A hot encoded
    feature is replaced by the list of its possible values, in order; its
    b"?" entry is renamed to b"? " followed by the feature name, so that the
    "?" columns of different features can be told apart.
    """
    expandedNames = []
    for featureName in LIST_NAMES_FEATURE:
        if featureName not in POSSIBLEVALUESFORFEATURE:
            expandedNames.append(featureName)
            continue
        # associate every ? with the feature it belongs to
        expandedNames.extend(
            b"? " + featureName if value == b"?" else value
            for value in POSSIBLEVALUESFORFEATURE[featureName]
        )
    return expandedNames
def generateHotEncoding(adultData):
    """Return a new tensor holding the hot encoded version of ``adultData``.

    One encoder is built for every feature listed in
    POSSIBLEVALUESFORFEATURE, the width of the result is computed from them,
    and a freshly allocated tensor is filled and returned. ``adultData`` is
    only read through its ``shape`` and its rows here and in the helpers.
    """
    encoders = generateEncodings(POSSIBLEVALUESFORFEATURE)
    totalColumns = getNewNumCol(encoders, adultData)
    return generateNewTensor(
        adultData, totalColumns, POSSIBLEVALUESFORFEATURE, encoders)
def generateEncodings(possibleValuesForFeature):
    """Build one HotEncoding per feature.

    ``possibleValuesForFeature`` maps a feature name to the list of values
    that feature can take. The result maps the same names to the
    HotEncoding instance created from the corresponding list.
    """
    return {
        featureName: HotEncoding(possibleValuesForFeature[featureName])
        for featureName in possibleValuesForFeature
    }
def getNewNumCol(dictHotEncoding, adultData):
    """Return the number of columns of the tensor after hot encoding.

    Starts from the column count of ``adultData`` (``adultData.shape[1]``)
    and, for every encoder in ``dictHotEncoding``, adds the number of
    possible values minus one: the feature already occupies one column.
    """
    originalColumns = adultData.shape[1]
    extraColumns = sum(
        encoding.getNumPossibleValues() - 1
        for encoding in dictHotEncoding.values()
    )
    return originalColumns + extraColumns
def generateNewTensor(adultData, newNumberOfColumns, possibleValuesForFeature, dictHotEncoding):
    """Allocate the hot encoded tensor and fill it from ``adultData``.

    The result is a NumPy array of fixed-width byte strings with as many
    rows as ``adultData`` (``adultData.shape[0]``) and ``newNumberOfColumns``
    columns. The keys of ``possibleValuesForFeature`` tell which features
    are hot encoded with the encoders in ``dictHotEncoding``; the remaining
    features are copied unchanged by fillNewTensor.

    Cells are at least 30 bytes wide. When ``adultData`` is itself a
    byte-string array with wider cells, that width is used instead so the
    copied values are not silently truncated.
    """
    itemSize = 30
    sourceDtype = getattr(adultData, "dtype", None)
    # Only byte-string inputs (kind "S") report their cell width in bytes
    # through itemsize; any other input keeps the default width.
    if sourceDtype is not None and sourceDtype.kind == "S":
        itemSize = max(itemSize, sourceDtype.itemsize)
    newAdultData = np.empty(
        dtype="S%d" % itemSize, shape=[adultData.shape[0], newNumberOfColumns])
    listNamesHotEncodedFeatures = possibleValuesForFeature.keys()
    fillNewTensor(newAdultData, adultData,
                  listNamesHotEncodedFeatures, dictHotEncoding)
    return newAdultData
def fillNewTensor(newAdultData, adultData, listNamesHotEncodedFeatures, dictHotEncoding):
    """Fill every row of ``newAdultData`` from the matching row of ``adultData``.

    Rows are paired in order and processing stops at the shorter of the two
    inputs. Each destination row is written through item assignment by
    fillRowNewTensor; nothing is returned.
    """
    for targetRow, sourceRow in zip(newAdultData, adultData):
        fillRowNewTensor(
            oldRow=sourceRow,
            newRow=targetRow,
            listNamesHotEncodedFeatures=listNamesHotEncodedFeatures,
            dictHotEncoding=dictHotEncoding,
        )
def fillRowNewTensor(oldRow, newRow, listNamesHotEncodedFeatures, dictHotEncoding):
    """Write the hot encoded version of ``oldRow`` into ``newRow``.

    The features are visited in the order of LIST_NAMES_FEATURE, reading one
    column of ``oldRow`` per feature. A feature named in
    ``listNamesHotEncodedFeatures`` is expanded by its encoder from
    ``dictHotEncoding``; any other feature is copied into the next free
    column of ``newRow`` unchanged.
    """
    writePos = 0
    for readPos, featureName in enumerate(LIST_NAMES_FEATURE):
        if featureName not in listNamesHotEncodedFeatures:
            newRow[writePos] = oldRow[readPos]
            writePos += 1
            continue
        # The encoder returns the first free column after what it wrote.
        writePos = dictHotEncoding[featureName].HotEncodeOldRow(
            oldRow, newRow, writePos, readPos)