"""
Script that fetches info from table tableofcontents in old db and transforms it
into one toc json file for each new collection.
Created by Anna Movall and Jonas Lillqvist in March 2020.
"""
import mysql.connector
import psycopg2
import json
import operator
import re
# insert current project id here
PROJECT_ID = 10
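
# For reference, the json toc produced for each collection has roughly the shape sketched
# below; the keys come from the code further down, but the ids, titles and dates here are
# only illustrative:
# {
#     "text": "Collection title", "collectionId": "30", "type": "title",
#     "children": [
#         {"url": "", "type": "est", "text": "Item title", "itemId": "30_123", "date": "1871"},
#         {"url": "", "type": "section_title", "text": "Section title", "itemId": "", "date": ""}
#     ]
# }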

conn_old_db = mysql.connector.connect(
    host="",
    database="",
    user="",
    passwd=""
)
cursor_old = conn_old_db.cursor()

conn_new_db = psycopg2.connect(
    host="",
    database="",
    user="",
    port="",
    password=""
)
cursor_new = conn_new_db.cursor()
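
# NOTE: the connection credentials above are left blank here and need to be filled in
# before the script is run.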

def read_dict_from_file(filename):
    with open(filename, encoding="utf-8") as source_file:
        json_content = json.load(source_file)
    return json_content
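
# The id dictionary files map old ids (as string keys) to new ids; collection_ids.json is
# assumed to look something like {"15": 18, "32": 33} (the values here are only illustrative).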

# get info about toc items in one collection from the old db
def get_toc_info(old_collection_id):
    # collection Letters (Brev) was split into two new collections (30, 31), which appear only in the old table publications_collections, not in publications or tableofcontents
    # their actual old collection id 15 therefore has to be used instead of the publications_collections ids 30/31 found in the coll_id_dict
    if old_collection_id == "30":
        fetch_query = """SELECT tableofcontents.title, toc_date, toc_linkID, tableofcontents.sortOrder, publications_group.sortOrder FROM tableofcontents, publications_group WHERE toc_zts_id=%s AND toc_coll_id=%s AND toc_group_id=group_id"""
        values_to_insert = (15, 1)
    elif old_collection_id == "31":
        fetch_query = """SELECT title, toc_date, toc_linkID, sortOrder FROM tableofcontents WHERE toc_zts_id=%s AND toc_coll_id=%s"""
        values_to_insert = (15, 2)
    else:
        fetch_query = """SELECT title, toc_date, toc_linkID, sortOrder FROM tableofcontents WHERE toc_zts_id=%s"""
        values_to_insert = (old_collection_id,)
    cursor_old.execute(fetch_query, values_to_insert)
    toc_info = cursor_old.fetchall()
    # the date value for Brev needs to be edited so that None is substituted with "0",
    # otherwise sorting by date is not possible
    # to be editable, the tuples in toc_info need to be converted to lists
    if old_collection_id == "30" or old_collection_id == "31":
        toc_info_list = []
        for row_tuple in toc_info:
            row_list = list(row_tuple)
            if row_list[1] is None:
                row_list[1] = "0"
            toc_info_list.append(row_list)
        # for Forlagskorrespondens, sort by publications_group.sortOrder, then by date
        if old_collection_id == "30":
            toc_info_sorted = sorted(toc_info_list, key=operator.itemgetter(4, 1))
        # for Foraldrakorrespondens, sort by date
        elif old_collection_id == "31":
            toc_info_sorted = sorted(toc_info_list, key=operator.itemgetter(1))
    # for other collections, sort by sortOrder
    else:
        toc_info_sorted = sorted(toc_info, key=operator.itemgetter(3))
    return toc_info_sorted
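
# Each row returned by get_toc_info has the shape (title, toc_date, toc_linkID, sortOrder),
# with publications_group.sortOrder as a fifth element for collection 30; the first row
# holds the title of the collection itself.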

# creates the toc dictionary used for the json file
def create_dictionary(toc_info_sorted, old_collection_id, new_collection_id):
    # handle collections that have no publications in the db
    if len(toc_info_sorted) == 0:
        print("List empty. Collection id old/new: ", old_collection_id, new_collection_id)
        return False
    # the first row in the list contains the name of the collection
    collection_name = toc_info_sorted[0][0]
    # create the first level of the dictionary as required for the json toc
    collection_toc_dict = {"text": collection_name, "collectionId": str(new_collection_id), "type": "title", "children": []}
    # loop through toc_info_sorted, skipping the first row, which contains the collection name
    for i in range(1, len(toc_info_sorted)):
        row = toc_info_sorted[i]
        text_title = row[0]
        toc_date = row[1]
        toc_linkID = row[2]
        # skip rows which refer to letters listed in tableofcontents with no publication linked to them; these have "Mibr" in toc_linkID
        if toc_linkID is not None:
            match = re.search("Mibr", toc_linkID)
            if match is not None:
                continue
        # an item id is required for the json toc items
        itemId = add_itemId(row, old_collection_id, new_collection_id)
        # special rule for Forelasningar, which contains descriptions pertaining to a title
        # the text is added to the previous toc item under the key "description"; no new toc item is created
        if itemId == "" and old_collection_id == "20":
            toc_item_dict["description"] = text_title
            continue
        # toc items which stand for sections and do not link to texts have no itemId
        elif itemId == "":
            toc_type = "section_title"
        # items with an itemId are links to reading texts, with type est
        else:
            toc_type = "est"
        if toc_date is None:
            toc_date = ""
        # create a dict for the toc item and append it to the list of children of the first level dict
        toc_item_dict = {"url": "", "type": toc_type, "text": text_title, "itemId": itemId, "date": toc_date}
        collection_toc_dict["children"].append(toc_item_dict)
    return collection_toc_dict

# constructs an itemId based on the new collection id and the new publication id
# to find the publication id, we need a legacy_id,
# which is constructed from the old collection id and toc_linkID from the old db
def add_itemId(row, old_collection_id, new_collection_id):
    toc_linkID = row[2]
    if toc_linkID is None or toc_linkID == "":
        itemId = ""
    # a toc_linkID that contains ch + 1-3 digits, possibly followed by pos + 1-4 digits, is a special kind of link
    # it refers to a part of a file (a div or an anchor element)
    # the part beginning with ch, the fragment_id, needs to be appended to the itemId,
    # but it has to be removed from toc_linkID before constructing the legacy_id
    else:
        pattern = re.compile(r";(ch\d{1,3}(;pos\d{1,4})?)")
        match = re.search(pattern, toc_linkID)
        if match is not None:
            fragment_id = match.group(1)
            toc_linkID = re.sub(pattern, "", toc_linkID)
        else:
            fragment_id = 0
        # collection Letters (Brev) was split into two new collections (30, 31), which appear only in the old table publications_collections, not in publications or tableofcontents
        # their actual old collection id 15 therefore has to be used instead of the publications_collections ids 30/31 found in the coll_id_dict
        if old_collection_id == "30" or old_collection_id == "31":
            old_collection_id = "15"
        legacy_id = old_collection_id + "_" + toc_linkID
        publication_id = fetch_publication_id(legacy_id)
        if fragment_id == 0:
            itemId = str(new_collection_id) + "_" + str(publication_id)
        else:
            itemId = str(new_collection_id) + "_" + str(publication_id) + "_" + fragment_id
    return itemId
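
# Illustrative example with made-up ids: a toc_linkID of "15001;ch2;pos12" gives
# fragment_id "ch2;pos12", a cleaned toc_linkID of "15001" and a legacy_id of "15_15001";
# with publication id 456 and new collection id 30, the resulting itemId is "30_456_ch2;pos12".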

# get publication_id from the new db using legacy_id
# limit the selection to publications in collections belonging to the current project
def fetch_publication_id(legacy_id):
    fetch_query = """SELECT id FROM publication WHERE legacy_id=%s AND publication_collection_id IN (SELECT id FROM publication_collection WHERE project_id=%s)"""
    value_to_insert = (legacy_id, PROJECT_ID)
    cursor_new.execute(fetch_query, value_to_insert)
    result = cursor_new.fetchone()
    if result is None:
        print(legacy_id, "not found in publication")
        publication_id = ""
    else:
        publication_id = result[0]
    return publication_id

def write_dict_to_file(dictionary, filename):
    json_dict = json.dumps(dictionary, ensure_ascii=False)
    with open(filename, "w", encoding="utf-8") as output_file:
        output_file.write(json_dict)
    print("Dictionary written to file", filename)

# special function for generating the toc for Lfb: values come from a csv file, not from table tableofcontents
def create_toc_for_Lfb(filename, collection_id_dict):
    lfb_list = create_list_from_csv(filename)
    new_collection_id = collection_id_dict["32"]
    collection_toc_dict = {"text": "Läsning för barn", "collectionId": str(new_collection_id), "type": "title", "children": []}
    for row in lfb_list:
        title = row[0]
        legacy_id = row[3]
        publication_id = fetch_publication_id(legacy_id)
        itemId = str(new_collection_id) + "_" + str(publication_id)
        toc_item_dict = {"url": "", "type": "est", "text": title, "itemId": itemId, "date": ""}
        collection_toc_dict["children"].append(toc_item_dict)
    filename = "toc_files/" + str(new_collection_id) + ".json"
    write_dict_to_file(collection_toc_dict, filename)

# creates a list from a csv file with publication name, div id, publication id and legacy id
def create_list_from_csv(filename):
    with open(filename, "r", encoding="utf-8") as source_file:
        lfb_list = []
        for line in source_file:
            row = line.rstrip()
            elements = row.split(";")
            lfb_list.append(elements)
    return lfb_list
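
# Each row in the csv is assumed to be semicolon-separated in the order
# publication name; div id; publication id; legacy id, for example (made-up values):
# "Title of text;div1;456;15_123"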

def main():
    collection_id_dict = read_dict_from_file("id_dictionaries/collection_ids.json")
    for key in collection_id_dict.keys():
        old_collection_id = key  # string value!
        # the toc for Lasning for barn is created from a csv file by a special function
        if old_collection_id == "32":
            continue
        new_collection_id = collection_id_dict[old_collection_id]
        toc_info_sorted = get_toc_info(old_collection_id)
        # toc_info_sorted will be an empty list if the key is not found in tableofcontents!
        collection_toc_dict = create_dictionary(toc_info_sorted, old_collection_id, new_collection_id)
        if collection_toc_dict:
            filename = "toc_files/" + str(new_collection_id) + ".json"
            write_dict_to_file(collection_toc_dict, filename)
    create_toc_for_Lfb("csv/Lfb_split.csv", collection_id_dict)
    cursor_new.close()
    conn_new_db.close()
    cursor_old.close()
    conn_old_db.close()


if __name__ == "__main__":
    main()