-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathcomposite.py
166 lines (138 loc) · 4.93 KB
/
composite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
'''
# Adds a composite variable to the input data and dictionary, using the provided script
@copyright: Fathom Information Design 2014
'''
import csv
import sys
import os
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from importlib import import_module
# Command-line arguments:
#   argv[1]  folder of the Mirador dataset to modify
#   argv[2]  path to the user script that defines the composite variable
#   argv[3]  optional suffix; when given, results go to new files whose names
#            carry this suffix instead of overwriting the originals
mira_path, user_script = sys.argv[1], sys.argv[2]
has_suffix = len(sys.argv) == 4
overwrite = not has_suffix
append_str = sys.argv[3] if has_suffix else ""

# Remember the starting folder so it can be restored after the script's init
work_path = os.getcwd()

# Import the user script as a module: put its folder first on sys.path and
# use the file name up to the first dot as the module name.
script_dir, script_file = os.path.split(user_script)
module_path = os.path.abspath(script_dir)
module_filename = script_file.partition('.')[0]
sys.path.insert(0, module_path)
module = import_module(module_filename)
# Run the user script's init() hook, which is optional: scripts that do not
# define it are simply skipped.
init_hook = getattr(module, "init", None)
if init_hook is not None:
    # Changing to script folder just in case it opens some files during initialization
    os.chdir(module_path)
    try:
        init_hook()
    finally:
        # Always go back to the starting folder, so that relative paths used
        # afterwards (e.g. the dataset folder) keep resolving from where the
        # tool was launched. Errors raised inside init() itself, including
        # AttributeError, are deliberately left to propagate rather than
        # being mistaken for "no init() defined".
        os.chdir(work_path)
print("ADDING COMPOSITE " + module.get_name() + " TO MIRADOR DATASET IN " + mira_path + "...")

# Input files inside the dataset folder
datafile, dictfile, grpfile, binfile = (
    mira_path + "/" + leaf
    for leaf in ("data.tsv", "dictionary.tsv", "groups.xml", "data.bin")
)

# Output files: either the inputs themselves (overwrite mode) or siblings
# whose base name carries the user-provided suffix.
if overwrite:
    # data.bin is removed when the dataset is modified in place
    # (presumably a binary copy of data.tsv that would go stale - TODO confirm)
    if os.path.isfile(binfile):
        os.remove(binfile)
    datafile1, dictfile1, grpfile1 = datafile, dictfile, grpfile
else:
    datafile1 = "".join([mira_path, "/data", append_str, ".tsv"])
    dictfile1 = "".join([mira_path, "/dictionary", append_str, ".tsv"])
    grpfile1 = "".join([mira_path, "/groups", append_str, ".xml"])
# Rows of the dictionary file; filled in when the dictionary is read.
# NOTE: this name shadows the builtin `dict`, but other parts of the script
# refer to it, so it is kept as is.
dict = []

# Load the whole data table in memory; the first row holds the column titles
with open(datafile, 'r') as tsv:
    data = list(csv.reader(tsv, dialect="excel-tab"))
titles = list(map(str, data[0]))

# Every variable the user script needs must be a column of the table;
# stop at the first one that is missing.
variables = module.variables()
for var in variables:
    if var in titles:
        continue
    print("ERROR: the variable", var, "is required to calculate", module.get_name(), " but is not present in the dataset")
    sys.exit(1)
# var_weights: for each input variable of the composite, the content of the
# fourth dictionary column (used later as the name of its weight variable).
# weight_types: for each variable whose fourth column is 'sample weight' or
# 'subsample weight', that label.
var_weights = {}
weight_types = {}
with open(dictfile) as tsv:
    for i, row in enumerate(csv.reader(tsv, dialect="excel-tab")):
        # Get the name from the table titles, since the dictionary will contain the aliases
        name = titles[i]
        if len(row) == 4:
            fourth = row[3]
            if name in variables:
                var_weights[name] = fourth
            if fourth in ('sample weight', 'subsample weight'):
                weight_types[name] = fourth
        # Keep every row so the dictionary can be written back out later
        dict.append(row)
# Write the data table back with one extra column holding the composite.
titles.append(module.get_name())

# The column of each input variable is the same for every row, so look the
# positions up once instead of searching the title list for each cell.
var_columns = {var: titles.index(var) for var in variables}

# newline="" lets the csv module control row terminators itself; without it,
# platforms that translate "\n" on write (Windows) end up with "\r\r\n".
with open(datafile1, "w", newline="") as tsv:
    writer = csv.writer(tsv, dialect="excel-tab")
    writer.writerow(titles)
    for row in data[1:]:
        # The user script receives the raw cell values keyed by variable name
        values = {var: row[col] for var, col in var_columns.items()}
        row.append(module.calculate(values))
        writer.writerow(row)
# Create dictionary entry for the composite variable: title, type and range
# as reported by the user script, followed by a weight variable when at least
# one of the input variables has one.
dict_entry = [module.get_title(), module.get_type(), module.get_range()]
if var_weights:
    # Adding weight variable: a single weight is chosen among those attached
    # to the input variables.
    weight = ''
    for var in variables:
        # Only input variables with a four-column dictionary row have a
        # weight recorded; the others are skipped instead of raising KeyError.
        candidate = var_weights.get(var)
        if not candidate:
            continue
        if not weight:
            weight = candidate
            continue
        # A weight with no recorded type yields None here, which matches none
        # of the rules below, so the current choice is kept.
        t0 = weight_types.get(weight)
        t1 = weight_types.get(candidate)
        if t0 == 'sample weight' and t1 == 'sample weight' and ('INT' in weight) and ('MEC' in candidate):
            # MEC sample weights have priority over INT weights
            weight = candidate
        elif t0 == 'sample weight' and t1 == 'subsample weight':
            # Subsample weights have priority over sample weights
            weight = candidate
        elif t0 == 'subsample weight' and t1 == 'subsample weight' and weight != candidate:
            print("Warning: the composite variable is calculated across different subsamples")
    dict_entry.append(weight)
# Write the dictionary back: all the original rows, then the new entry for
# the composite variable.
# newline="" lets the csv module control row terminators itself; without it,
# platforms that translate "\n" on write (Windows) end up with "\r\r\n".
with open(dictfile1, "w", newline="") as tsv:
    writer = csv.writer(tsv, dialect="excel-tab")
    writer.writerows(dict)
    writer.writerow(dict_entry)
# Add entry to the groups file: the composite variable is registered under
# <group name="Composites"> / <table name=...> / <variable name=...>,
# creating the group and the table if they do not exist yet.
tree = ET.parse(grpfile)
root = tree.getroot()

# First top-level group called 'Composites', if any
comp_group = next(
    (g for g in root.findall('group') if g.attrib['name'] == 'Composites'),
    None,
)
# TODO: specify the location to insert the composite group as an argument
if comp_group is None:
    comp_group = ET.Element('group', {"name": "Composites"})
    # Position len(root) - 1 puts the new group right before the last
    # existing child of the root element
    root.insert(len(root) - 1, comp_group)

# First table of that group whose name is the one given by the user script
comp_table = next(
    (t for t in comp_group.findall('table') if t.attrib['name'] == module.get_table()),
    None,
)
if comp_table is None:
    comp_table = ET.SubElement(comp_group, "table", {"name": module.get_table()})

variable = ET.SubElement(comp_table, "variable", {"name": module.get_name()})
# Prettifying XML string using BeautifulSoup and writing to file
xml_soup = BeautifulSoup(ET.tostring(root), "xml")
pretty_xml = xml_soup.prettify()
# The context manager closes the file even if the write fails. The text is
# written as UTF-8 (the default encoding of XML documents) rather than in
# whatever the platform's locale encoding happens to be.
with open(grpfile1, "w", encoding="utf-8") as xml_file:
    xml_file.write(pretty_xml)
print("DONE")