-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpopulate_publication_sent_letter.py
701 lines (668 loc) · 33.6 KB
/
populate_publication_sent_letter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
# This script populates table publication,
# and also other tables connected to publication, such as
# publication_manuscript, translation, translation_text, event, event_connection,
# event_occurrence. It also creates all the needed XML files for each
# publication and manuscript and updates the db with the file paths.
# The starting point is a csv file containing info about sent letters,
# which will be made into publications. This file was updated by one of the
# find_facsimiles-scripts, and this script adds more info to the file:
# the publication id and title. They will be needed later when populating
# table facsimile_collection.
# The script also needs the name_dictionary and the name_id_dictionary
# created by deal_with_persons.py in order to connect publication with subject
# (in this case a letter and its recipient). It also uses other dictionaries
# in order to make genre and language values uniform, and a csv with
# info about persons.
# Sample input and output (CSV) at end of file.
import psycopg2
import re
import os
import json
from bs4 import BeautifulSoup
# Database connection; host/database/user/port/password are filled in
# before running (left blank in version control).
conn_db = psycopg2.connect(
    host="",
    database="",
    user="",
    port="",
    password=""
)
# single module-level cursor shared by every function below
cursor = conn_db.cursor()
# id of the publication_collection these letters belong to
COLLECTION_ID = 1
# root folder for the generated template XML files
XML_OUTPUT_FOLDER = "documents/Delutgava_1/Brev/"
# input csv (updated earlier by a find_facsimiles script) and the
# output csv that gets publication id and title appended to each row
CSV_IN = "csv/sent_1_facsimiles.csv"
CSV_OUT = "csv/sent_1_facsimiles_id.csv"
# read the original semicolon-separated csv file into a list of rows
def create_list_from_csv(filename):
    """Read *filename* (";"-separated) and return a list of row lists.

    Empty fields become None. Rows that split into 20 or 21 fields get
    their trailing empty field dropped (21 fields means the row carries
    an alternative facsimile).
    """
    rows = []
    with open(filename, "r", encoding="utf-8-sig") as source_file:
        for raw_line in source_file:
            fields = [
                field if field != "" else None
                for field in raw_line.rstrip().split(";")
            ]
            rows.append(fields)
            print("list length: " + str(len(fields)))
            # get rid of the empty value at the end of the row
            if len(fields) == 20:
                fields.pop(19)
            # this length means there's an alternative facsimile
            if len(fields) == 21:
                fields.pop(20)
    return rows
# load a dictionary that was stored as JSON
def read_dict_from_file(filename):
    """Return the JSON content of *filename* (read as utf-8 with BOM)."""
    with open(filename, encoding="utf-8-sig") as source_file:
        return json.load(source_file)
# write the rows back out as a semicolon-separated csv file
def write_list_to_csv(list, filename):
    """Write each row of *list* to *filename*, ";"-separated.

    None values are written as empty fields; every row ends with a
    trailing ";" followed by a newline, mirroring the input format.
    """
    with open(filename, "w", encoding="utf-8-sig") as output_file:
        for row in list:
            cells = ["" if item is None else str(item) for item in row]
            output_file.write(";".join(cells) + ";\n")
    print("List written to file", filename)
# populate table publication with sent letters/telegrams
# and create needed content in other tables, i.e. translation,
# translation_text, event, event_connection, event_occurrence,
# publication_manuscript
def create_sent_publication(COLLECTION_ID, persons_list, sent_letters, name_dictionary, genre_dictionary, language_dictionary, name_id_dictionary):
    """Insert one publication per sent letter plus all connected rows and files.

    For each csv row: inserts a publication, builds the swe/fin titles
    (translation/translation_text), connects the receiver (event +
    event_connection), records the sending (event_occurrence), creates a
    publication_manuscript row and the template XML files on disk.
    Appends (publication_id, title_swe) to each csv row in place and
    returns the augmented sent_letters list. Commits once at the end, so
    a crash mid-run leaves the db untouched.
    """
    directory = XML_OUTPUT_FOLDER + "Sant"
    directory_path = create_directory(directory)
    insert_query = """INSERT INTO publication(publication_collection_id, published, genre, original_publication_date, original_language, archive_signum) VALUES(%s, %s, %s, %s, %s, %s) RETURNING id"""
    for letter in sent_letters:
        published = 1
        category = letter[1]
        # in this case (sent letters), use this spot for adding the
        # subject.id of the receiver, if this person has been added
        # to the db recently (normally this is where you
        # define whether this document is written by LM or not)
        # "x" signifies LM as the writer
        if letter[2] != "x":
            person_id = letter[2]
        else:
            person_id = None
        # csv is in Finnish, but db has Swedish values for this
        if category in genre_dictionary.keys():
            genre = genre_dictionary[category]
        else:
            genre = category
        original_date = letter[0]
        original_publication_date, no_date, date_uncertain = replace_date(original_date)
        language = letter[7]
        # register the language(s)
        original_language, language_for_db = register_language(language, language_dictionary)
        unordered_name = letter[4]
        # register the archive signums, old and new
        # and the archive folder, if present
        old_archive_signum = letter[13]
        new_archive_signum = letter[10]
        archive_folder = letter[8]
        if old_archive_signum is not None and new_archive_signum is not None and archive_folder is not None and archive_folder != "KA":
            archive_signum = old_archive_signum + ", " + new_archive_signum + ", " + archive_folder
        elif old_archive_signum is not None and new_archive_signum is not None and archive_folder == "KA":
            archive_signum = old_archive_signum + ", " + new_archive_signum
        elif old_archive_signum is not None and new_archive_signum is not None and archive_folder is None:
            archive_signum = old_archive_signum + ", " + new_archive_signum
        # this is material from another person's archive than Mechelin's,
        # but still at the National Archives
        # (if new_archive_signum ends with ".pdf", it's not from the National Archives)
        elif old_archive_signum is None and new_archive_signum is not None and not new_archive_signum.endswith(".pdf") and archive_folder is not None:
            archive_signum = new_archive_signum + ", " + archive_folder
        # this is material from another archive than the National Archives
        else:
            archive_signum = archive_folder
        values_to_insert = (COLLECTION_ID, published, genre, original_publication_date, language_for_db, archive_signum)
        cursor.execute(insert_query, values_to_insert)
        publication_id = cursor.fetchone()[0]
        # the title of the publication is in swe and fin
        # the titles are kept in a different table, translation_text
        # titles contain person info
        # titles contain the date (almost) as it has been recorded originally
        # this function may update person_id
        person_legacy_id, person, title_swe, title_fin, translation_id, person_id = update_sent_publication_with_title(publication_id, unordered_name, persons_list, name_dictionary, original_date, no_date, date_uncertain, person_id)
        # these are sent letters/telegrams, i.e. someone else than Mechelin received them
        # update_sent_publication_with_title found out who the receiver was
        # create connection between publication and receiver
        if genre == "sänt telegram":
            event_connection_type = "received telegram"
        else:
            event_connection_type = "received letter"
        event_id = create_event_and_connection(person_legacy_id, person_id, event_connection_type, name_id_dictionary)
        # the publication is still a sent one from Mechelin's point of view
        # so let's register that as well when creating the event_occurrence
        if genre == "sänt telegram":
            event_occurrence_type = "sent telegram"
        else:
            event_occurrence_type = "sent letter"
        create_event_occurrence(publication_id, event_id, event_occurrence_type)
        # since these publications are manuscripts, populate table publication_manuscript
        manuscript_type = 1
        manuscript_id = create_publication_manuscript(publication_id, published, manuscript_type, archive_signum, language_for_db, title_swe)
        # each publication always has two XML-files, a Swedish and a Finnish one
        # if original_language is something else, then a third file will be created
        # these files contain a template and the editors will fill them with content
        # just like the titles, file paths are not kept in table publication
        # update tables translation_text and publication_manuscript with the file paths
        create_file(directory_path, person, person_id, original_publication_date, original_language, publication_id, translation_id, title_swe, title_fin, manuscript_id)
        letter.extend((publication_id, title_swe))
        print(letter)
    print("Table publication updated with the new publications.")
    conn_db.commit()
    return sent_letters
# date has mainly been recorded as 1.1.1800
# make date format uniform and get rid of other characters
# new format: YYYY-MM-DD, use XX if some part of the date is missing
# if no date can be easily extracted from original_date,
# or if there is no date, make date XXXX-XX-XX
def replace_date(original_date):
    """Normalize a free-form csv date to YYYY-MM-DD (XX for unknown parts).

    Returns (date, no_date, date_uncertain). A "?" or leading "ca "
    marks the date as uncertain; an empty or missing original_date sets
    no_date and yields the all-unknown "XXXX-XX-XX".
    """
    date = "XXXX-XX-XX"
    if original_date is not None:
        original_date = original_date.replace("/", ".")
        original_date = original_date.replace("[", "").replace("]", "")
        # uncertainty markers: a "?" anywhere or a leading "ca "
        uncertain = re.search(r"\?", original_date) or re.search("^ca ", original_date)
        date_uncertain = bool(uncertain)
        if uncertain:
            original_date = original_date.replace("?", "").replace("ca ", "")
        no_date = False
        found = False
        # full day.month.year
        match = re.search(r"(\d{1,2})\.(\d{1,2})\.(\d{4})", original_date)
        if match:
            date = match.group(3) + "-" + match.group(2).zfill(2) + "-" + match.group(1).zfill(2)
            found = True
        # day..year: original was e.g. 18.?.1885 and the ? was removed above
        if not found:
            match = re.search(r"^(\d{1,2})\.\.(\d{4})", original_date)
            if match:
                date = match.group(2) + "-XX-" + match.group(1).zfill(2)
                found = True
        # month.year only
        if not found:
            match = re.search(r"^(\d{1,2})\.(\d{4})", original_date)
            if match:
                date = match.group(2) + "-" + match.group(1).zfill(2) + "-XX"
                found = True
        # bare year at the end of the string
        if not found:
            match = re.search(r"(\d{4})$", original_date)
            if match:
                date = match.group(0) + "-XX-XX"
        # decades, e.g. "1880-talet" (the 1880s) -> 188X
        if not found:
            match = re.search(r"(\d{2})(\d{1})(\d{1})-tal", original_date)
            if match:
                century = match.group(1)
                decade = match.group(2)
                if (century == "18" or century == "19") and decade != "0":
                    date = century + decade + "X-XX-XX"
                if (century == "18" or century == "19") and decade == "0":
                    date = century + "XX-XX-XX"
    if original_date == "" or original_date is None:
        date_uncertain = False
        no_date = True
    return date, no_date, date_uncertain
# there can be several languages for one publication
# meaning that the text e.g. can start off in Swedish
# and then continue in French
# all languages for a publication are recorded in the db
def register_language(language, language_dictionary):
    """Translate the csv language field into db language codes.

    The field may hold several languages separated by ", ". Each entry
    is mapped through language_dictionary; a missing field or unknown
    entry becomes "xx". Returns (list_of_codes, joined_db_string).
    """
    if language is None:
        original_language = ["xx"]
    else:
        original_language = [
            language_dictionary.get(part, "xx")
            for part in language.split(", ")
        ]
    language_for_db = ", ".join(original_language)
    return original_language, language_for_db
# create the titles for the publication
# it's a sent letter, so the title consists of date, sender and receiver
# there's a swe and a fin title
# due to the nature of the csv we started from, all titles are not going to
# be perfect after this, so some will have to be changed later by hand
# but hey, this is a humanist project
def update_sent_publication_with_title(publication_id, unordered_name, persons_list, name_dictionary, original_date, no_date, date_uncertain, person_id):
    """Build the swe/fin titles, store them in translation_text, link them.

    Resolves the receiver through person_id (if it is a digit string),
    through name_dictionary (legacy "pe…" id or plain db id), or falls
    back to "unknown" (subject id 1912). Returns (person_legacy_id,
    person, title_swe, title_fin, translation_id, person_id); person_id
    may have been updated by the resolution.
    """
    # if the sender's id was in the csv
    if person_id is not None and person_id.isdigit():
        title_name_part_swe, title_name_part_fin, person, person_id, person_legacy_id = check_subject(person_id)
    # else look for person_legacy_id in name_dictionary using name as key
    # or, for persons recently added to the db:
    # look for their id (which is not person_legacy_id but the real db id)
    # in said dictionary, using name as key, and use that id for identifying them
    # name parts for titles are constructed differently depending on what id is used
    elif unordered_name is not None and unordered_name in name_dictionary.keys():
        old_or_new_id = name_dictionary[unordered_name]
        # legacy_id consists of "pe" + a number
        if "pe" in old_or_new_id:
            person_legacy_id = old_or_new_id
            title_name_part_swe, title_name_part_fin, person = construct_publication_title_name_part(persons_list, person_legacy_id, person_id)
        # db id is a number
        else:
            person_id = old_or_new_id
            title_name_part_swe, title_name_part_fin, person, person_id, person_legacy_id = check_subject(person_id)
    # if name isn't in dictionary and there's no id, we don't know this sender
    else:
        person_legacy_id = None
        person = None
        # 1912 is the subject id for "unknown person"
        person_id = 1912
        title_name_part_swe = "okänd"
        title_name_part_fin = "tuntematon"
    # make some slight changes to original_date, if needed,
    # since it'll be part of a title
    # an original_date written as 1/13.11.1885 means that the document
    # has both an old style (Julian) and a new style (Gregorian) date
    # in that case we'll use the Gregorian (modern) one,
    # which is the one on the right side of the slash
    # NOTE(review): original_date is assumed to be a string here; a row
    # with no date value at all would make sub() raise — confirm the csv
    # always has a (possibly empty) date field
    search_string = re.compile(r".*/")
    original_date = search_string.sub("", original_date)
    if no_date is True:
        title_swe = "odaterat" + " LM–" + title_name_part_swe
        title_fin = "päiväämätön" + " LM–" + title_name_part_fin
    # if there's some uncertainty about the date, add a standard phrase
    # and leave a "?" only if it signifies "month unknown"
    # also, an "approx." has to be translated, so we can't just use the
    # Swedish "ca" from the csv as such
    # in the replace_date function we already set the flag
    # which gives the right standard phrases in these cases
    elif date_uncertain is True:
        search_string = re.compile(r"^ca ")
        original_date = search_string.sub("", original_date)
        original_date = original_date.replace("?", "")
        search_string = re.compile(r"\.\.")
        original_date = search_string.sub(".?.", original_date)
        search_string = re.compile(r"^\.")
        original_date = search_string.sub("", original_date)
        # applied twice on purpose: removes up to two leading dots
        original_date = search_string.sub("", original_date)
        title_swe = "ca " + original_date + " LM–" + title_name_part_swe
        title_fin = "n. " + original_date + " LM–" + title_name_part_fin
    else:
        title_swe = original_date + " LM–" + title_name_part_swe
        title_fin = original_date + " LM–" + title_name_part_fin
    translation_id = create_translation()
    field_name = "name"
    table_name = "publication"
    create_translation_text(translation_id, title_swe, title_fin, field_name, table_name)
    update_query = """UPDATE publication SET translation_id = %s WHERE id = %s"""
    values_to_insert = (translation_id, publication_id)
    cursor.execute(update_query, values_to_insert)
    return person_legacy_id, person, title_swe, title_fin, translation_id, person_id
# if the id is in the csv (due to the person having been added
# to the db very recently and therefore isn't present in the name_dictionary)
# or if the person has been added to name_dictionary with a new id
# and not with legacy_id (because he/she is a recent db addition)
def check_subject(person_id):
    """Verify person_id against table subject and build title name parts.

    Returns (title_name_part_swe, title_name_part_fin, person, person_id,
    person_legacy_id). person_legacy_id is always None on this path; a
    missing or already-unknown person is mapped to subject id 1912 and
    the "unknown" placeholder names.
    """
    # check if this person exists in the db
    # if not: replace id with id for "unknown person"
    # thus handling e.g. typos in subject_id:s in the csv
    # or a test db with different values than the production db
    fetch_query = """SELECT id FROM subject WHERE id = %s"""
    value_to_insert = (person_id,)
    cursor.execute(fetch_query, value_to_insert)
    subject_exists = cursor.fetchone()
    if subject_exists is None or subject_exists[0] == 1912:
        if subject_exists is None:
            print("Person with id " + str(person_id) + " not in db!")
        person_id = 1912
        person = None
        title_name_part_swe = "okänd"
        title_name_part_fin = "tuntematon"
    else:
        title_name_part_swe, title_name_part_fin, person = construct_publication_title_name_part_from_id(person_id)
    person_legacy_id = None
    return title_name_part_swe, title_name_part_fin, person, person_id, person_legacy_id
# use person_legacy_id to find out the right person
def construct_publication_title_name_part(persons_list, person_legacy_id, person_id):
    """Find the csv row whose legacy id (column 13) matches and build names.

    Returns (title_name_part_swe, title_name_part_fin, person). If no row
    matches, the name variables are never bound and a NameError follows,
    as in the original.
    """
    for person in persons_list:
        if person[13] != person_legacy_id:
            continue
        title_name_part_swe = construct_swe_name(person, person_id)
        title_name_part_fin = construct_fin_name(person, person_id, title_name_part_swe)
        break
    return title_name_part_swe, title_name_part_fin, person
# if we already had the person's id, fetch name info
# and check for translated name info too
def construct_publication_title_name_part_from_id(person_id):
    """Fetch swe name fields for person_id, plus fi translations if any.

    Returns (title_name_part_swe, title_name_part_fin, person) where
    person is the 4-field swe row concatenated with the 3 fi name
    fields into a 7-tuple, matching what construct_swe_name /
    construct_fin_name unpack.
    """
    fetch_query = """SELECT preposition, last_name, first_name, translation_id FROM subject WHERE id = %s"""
    value_to_insert = (person_id,)
    cursor.execute(fetch_query, value_to_insert)
    person_sv = cursor.fetchone()
    translation_id = person_sv[3]
    if translation_id is not None:
        field_names = ["preposition", "last_name", "first_name"]
        i = 0
        # each list slot is overwritten with the fetched row, a 1-tuple;
        # the concatenation below then yields a 3-tuple of texts.
        # NOTE(review): fetchone() returns None when the translation_text
        # row is missing, which would make the "+" below raise — confirm
        # translated subjects always have all three rows
        for field_name in field_names:
            fetch_query = """SELECT text FROM translation_text WHERE translation_id = %s AND field_name = %s"""
            values_to_insert = (translation_id, field_names[i])
            cursor.execute(fetch_query, values_to_insert)
            field_names[i] = cursor.fetchone()
            i += 1
        person_fi = field_names[0] + field_names[1] + field_names[2]
    else:
        person_fi = (None, None, None)
    person = person_sv + person_fi
    title_name_part_swe = construct_swe_name(person, person_id)
    title_name_part_fin = construct_fin_name(person, person_id, title_name_part_swe)
    return title_name_part_swe, title_name_part_fin, person
# get swe name info for the person chosen
def construct_swe_name(person, person_id):
    """Build the Swedish name part of a publication title.

    With person_id None, person is a csv row (preposition at 1, surname
    at 2, letter-style forename at 7); otherwise it is the 7-field db
    tuple. A person without a forename yields just the surname; the
    preposition is only used when both forename and surname exist.
    """
    if person_id is None:
        prefix = person[1]
        surname = person[2]
        forename_letter = person[7]
    else:
        (prefix, surname, forename_letter, translation_id, prefix_fi, surname_fi, forename_fi) = person
    if not forename_letter:
        return surname
    pieces = [forename_letter]
    if prefix and surname:
        pieces.append(prefix)
        pieces.append(surname)
    elif surname:
        pieces.append(surname)
    return " ".join(pieces)
# get fin name info for the person chosen
def construct_fin_name(person, person_id, title_name_part_swe):
    """Build the Finnish name part of a publication title.

    Prefers Finnish surname/forename fields, falling back to Swedish
    fields where the Finnish ones are missing; with no Finnish parts at
    all (csv path) the already-built Swedish name part is reused.
    """
    if person_id is None:
        # csv row: swe fields at 1-2, fi fields at 3-4
        prefix = person[1]
        surname = person[2]
        surname_fi = person[3]
        forename_fi = person[4]
        if forename_fi:
            if prefix and surname_fi:
                return forename_fi + " " + prefix + " " + surname_fi
            if prefix:
                return forename_fi + " " + prefix + " " + surname
            if surname_fi:
                return forename_fi + " " + surname_fi
            if surname:
                return forename_fi + " " + surname
            return forename_fi
        if surname_fi:
            return surname_fi
        # if there are no Finnish name parts, the Swedish version is used
        return title_name_part_swe
    # db tuple: swe fields first, fi translations last
    (prefix, surname, forename_letter, translation_id, prefix_fi, surname_fi, forename_fi) = person
    if forename_fi:
        if prefix_fi and surname_fi:
            return forename_fi + " " + prefix_fi + " " + surname_fi
        if surname_fi:
            return forename_fi + " " + surname_fi
        return forename_fi
    return surname_fi
# populate table translation
def create_translation():
    """Insert a placeholder translation row and return its new id."""
    # translation.neutral_text is there in case text.translation_text has gone missing
    neutral_text = "No translation found"
    insert_query = """INSERT INTO translation(neutral_text) VALUES(%s) RETURNING id"""
    value_to_insert = (neutral_text,)
    cursor.execute(insert_query, value_to_insert)
    translation_id = cursor.fetchone()[0]
    return translation_id
# populate table translation_text with swe and fin titles or file paths
# for the publication
def create_translation_text(translation_id, text_swe, text_fin, field_name, table_name):
    """Insert one "sv" and one "fi" translation_text row for translation_id."""
    insert_query = """INSERT INTO translation_text(translation_id, language, text, field_name, table_name) VALUES(%s, %s, %s, %s, %s)"""
    values_to_insert_swe = (translation_id, "sv", text_swe, field_name, table_name)
    values_to_insert_fin = (translation_id, "fi", text_fin, field_name, table_name)
    cursor.execute(insert_query, values_to_insert_swe)
    cursor.execute(insert_query, values_to_insert_fin)
# create connection between publication and subject (the receiver of the letter)
def create_event_and_connection(person_legacy_id, person_id, event_connection_type, name_id_dictionary):
    """Insert an event and an event_connection tying the receiver to it.

    The receiver's subject id comes from name_id_dictionary (keyed by
    legacy id), or from person_id after verifying it exists in table
    subject, or defaults to 1912 ("unknown person"). Returns the new
    event id.
    """
    insert_query = """INSERT INTO event(type) VALUES(%s) RETURNING id"""
    event_type = "published"
    value_to_insert = (event_type,)
    cursor.execute(insert_query, value_to_insert)
    event_id = cursor.fetchone()[0]
    insert_query = """INSERT INTO event_connection(subject_id, event_id, type) VALUES(%s, %s, %s)"""
    if person_legacy_id:
        subject_id = name_id_dictionary[person_legacy_id]
        subject_id = int(subject_id)
    elif person_id:
        # check if this person exists in the db
        # if not: replace id with id for "unknown person"
        # thus handling e.g. typos in subject_id:s in the csv
        # or a test db with different values than the production db
        fetch_query = """SELECT id FROM subject WHERE id = %s"""
        value_to_insert = (person_id,)
        cursor.execute(fetch_query, value_to_insert)
        subject_exists = cursor.fetchone()
        if subject_exists is None:
            print("Person with id " + str(person_id) + " not in db!")
            subject_id = 1912
        else:
            subject_id = int(person_id)
    else:
        # id for "unknown person"
        subject_id = 1912
    values_to_insert = (subject_id, event_id, event_connection_type)
    cursor.execute(insert_query, values_to_insert)
    return event_id
# create connection between publication and event
def create_event_occurrence(publication_id, event_id, event_occurrence_type):
    """Insert an event_occurrence row linking the publication to the event."""
    insert_query = """INSERT INTO event_occurrence(type, event_id, publication_id) VALUES(%s, %s, %s)"""
    values_to_insert = (event_occurrence_type, event_id, publication_id)
    cursor.execute(insert_query, values_to_insert)
# populate table publication_manuscript
def create_publication_manuscript(publication_id, published, manuscript_type, archive_signum, language_for_db, title_swe):
    """Insert a publication_manuscript row and return its new id.

    The Swedish title doubles as the manuscript's name.
    """
    insert_query = """INSERT INTO publication_manuscript(publication_id, published, name, type, archive_signum, original_language) VALUES(%s, %s, %s, %s, %s, %s) RETURNING id"""
    values_to_insert = (publication_id, published, title_swe, manuscript_type, archive_signum, language_for_db)
    cursor.execute(insert_query, values_to_insert)
    manuscript_id = cursor.fetchone()[0]
    return manuscript_id
# create a directory (and any missing parents) if it doesn't exist yet
def create_directory(directory):
    """Ensure *directory* exists and return its path.

    Uses exist_ok=True instead of the original exists()-then-makedirs
    pair, which could raise if the directory appeared between the check
    and the creation.
    """
    os.makedirs(directory, exist_ok=True)
    return directory
# each publication has two XML files, a Swedish and a Finnish one
# if original_language is something else than sv or fi, then
# there's a third file and a file path that goes into publication_manuscript
# otherwise the sv file path goes there
# create files and directories and update translation_text and
# publication_manuscript with file paths
def create_file(directory_path, person, person_id, original_publication_date, original_language, publication_id, translation_id, title_swe, title_fin, manuscript_id):
    """Create the template XML files for one publication and record their paths.

    Builds a per-person directory and a per-publication directory under
    directory_path, writes sv and fi XML template files (plus one in the
    original language if it is foreign), stores the sv/fi paths in
    translation_text and the manuscript path in publication_manuscript.
    """
    # files and directories for letters contain the writer's name
    if person:
        name_part = create_name_part_for_file(person, person_id)
    else:
        name_part = "X_X"
    # files and directories contain the publication's date
    original_publication_date = original_publication_date.replace("-", "_")
    directory = directory_path + "/" + name_part
    person_directory_path = create_directory(directory)
    final_directory = person_directory_path + "/" + original_publication_date + "_" + name_part
    final_directory_path = create_directory(final_directory)
    # if the original language is just Swedish or Finnish:
    # there will be two files/file paths for the publication
    # if the publication is a manuscript, the Swedish/Finnish file
    # will be manuscript file
    if original_language[0] == "sv" or original_language[0] == "fi":
        file_name = original_publication_date + "_" + name_part + "_sv_" + str(publication_id) + ".xml"
        file_path_swe = final_directory_path + "/" + file_name
        write_to_file(file_path_swe, title_swe)
        file_name = original_publication_date + "_" + name_part + "_fi_" + str(publication_id) + ".xml"
        file_path_fin = final_directory_path + "/" + file_name
        write_to_file(file_path_fin, title_fin)
        add_file_path(translation_id, file_path_swe, file_path_fin)
        # update publication_manuscript too
        if manuscript_id is not None and original_language[0] == "sv":
            update_publication_manuscript_with_file_path(file_path_swe, manuscript_id)
        if manuscript_id is not None and original_language[0] == "fi":
            update_publication_manuscript_with_file_path(file_path_fin, manuscript_id)
    # if the language is foreign:
    # there will be three files/file paths for the publication
    # the foreign file will be manuscript file
    # in the csv, if the publication is partly in a foreign language,
    # this has to be the first language in the string containing the languages
    else:
        file_name = original_publication_date + "_" + name_part + "_" + original_language[0] + "_" + str(publication_id) + ".xml"
        file_path_orig = final_directory_path + "/" + file_name
        write_to_file(file_path_orig, title_swe)
        file_name = original_publication_date + "_" + name_part + "_sv_" + str(publication_id) + ".xml"
        file_path_swe = final_directory_path + "/" + file_name
        write_to_file(file_path_swe, title_swe)
        file_name = original_publication_date + "_" + name_part + "_fi_" + str(publication_id) + ".xml"
        file_path_fin = final_directory_path + "/" + file_name
        write_to_file(file_path_fin, title_fin)
        add_file_path(translation_id, file_path_swe, file_path_fin)
        # update publication_manuscript too
        update_publication_manuscript_with_file_path(file_path_orig, manuscript_id)
# Maps separators and accented characters to their file-name-safe
# replacements, in one table so the cleanup is a single pass instead of
# the original chain of ~30 .replace() calls.
_NAME_PART_CHAR_MAP = str.maketrans({
    "-": "_", "–": "_",
    "ç": "c", "Ç": "C",
    "é": "e", "è": "e", "ê": "e", "Ê": "E", "É": "E",
    "á": "a", "à": "a", "À": "A",
    "ü": "u", "ú": "u", "Ü": "U",
    "ï": "i", "í": "i",
    "ł": "l",
    "ô": "o", "ó": "o", "ò": "o", "ø": "o",
    "æ": "ae", "œ": "oe", "ß": "ss", "&": "et",
    "Ö": "O", "ö": "o", "Å": "A", "å": "a", "Ä": "A", "ä": "a",
})


# file and directory names contain a person's (Swedish) name,
# but not in the same order as in the publication title,
# and with certain replacements
def create_name_part_for_file(person, person_id):
    """Build the person part of a file/directory name.

    person is either a csv row (person_id None) or the 7-field db tuple;
    person_id 1912 ("unknown person") short-circuits to "X_X". The name
    is joined with underscores, stripped of punctuation, transliterated
    to ASCII-ish characters, and truncated so paths don't get too long.
    """
    if person_id is None:
        # csv row: preposition at 1, surname at 2, letter-style forename at 7
        prefix = person[1]
        surname = person[2]
        forename_letter = person[7]
    elif person_id == 1912:
        # unknown person: fixed placeholder, no cleanup needed
        return "X_X"
    else:
        (prefix, surname, forename_letter, translation_id,
         prefix_fi, surname_fi, forename_fi) = person
    if forename_letter and prefix and surname:
        name_part = prefix + "_" + surname + "_" + forename_letter
    elif forename_letter and surname:
        name_part = surname + "_" + forename_letter
    elif forename_letter:
        name_part = forename_letter
    else:
        name_part = surname
    # normalize separators; ". " must be handled before the lone "."
    name_part = name_part.replace(". ", "_")
    name_part = name_part.replace(".", "")
    name_part = name_part.replace(" ", "_")
    # drop punctuation that must not appear in file names
    name_part = re.sub(r",|\?|!|’|»|”|:|;|\(|\)|\[|\]|\'|\"", "", name_part)
    # transliterate accents and remaining separators in a single pass
    name_part = name_part.translate(_NAME_PART_CHAR_MAP)
    # shorten long names of files and directories
    # otherwise the file path may become too long
    if len(name_part) >= 40:
        name_part = name_part[0:39]
    return name_part
# the XML files contain a template with the publication's title
def content_template():
    """Return a BeautifulSoup tree holding the base TEI template for a letter.

    The <title> element is left empty; write_to_file() fills it in.
    """
    xml_template = '''
    <TEI xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.tei-c.org/ns/1.0" xsi:schemaLocation="">
    <teiHeader>
    <fileDesc>
    <titleStmt>
    <title></title>
    <respStmt>
    <resp/>
    <name/>
    </respStmt>
    </titleStmt>
    <publicationStmt>
    <publisher>Utgåvan Leo Mechelin / Leo Mechelin -editio</publisher>
    </publicationStmt>
    <sourceDesc>
    <p/>
    </sourceDesc>
    </fileDesc>
    </teiHeader>
    <text>
    <body xml:space="preserve">
    <div type="letter">
    </div>
    </body>
    </text>
    </TEI>
    '''
    return BeautifulSoup(xml_template, "xml")
# create the file and its content
def write_to_file(file_path, title):
    """Write the TEI template to file_path with *title* inserted as <title>."""
    with open(file_path, "w", encoding="utf-8-sig") as output_file:
        template_soup = content_template()
        # insert publication name as title
        template_title = template_soup.find("title")
        template_title.append(title)
        # write to file as string
        output_file.write(str(template_soup))
# add file paths to table translation_text
def add_file_path(translation_id, file_path_swe, file_path_fin):
    """Store the sv and fi XML file paths as translation_text rows."""
    create_translation_text(
        translation_id,
        file_path_swe,
        file_path_fin,
        "original_filename",
        "publication",
    )
# update table publication_manuscript
def update_publication_manuscript_with_file_path(file_path, manuscript_id):
    """Record the manuscript's XML file path on its publication_manuscript row."""
    update_query = """UPDATE publication_manuscript SET original_filename = %s WHERE id = %s"""
    values_to_insert = (file_path, manuscript_id)
    cursor.execute(update_query, values_to_insert)
def main():
    """Run the pipeline: read csv:s and dictionaries, populate the db, write csv."""
    persons_list = create_list_from_csv("csv/Personregister.csv")
    # info about publications of type "sent letter"
    sent_letters = create_list_from_csv(CSV_IN)
    name_dictionary = read_dict_from_file("dictionaries/name_dictionary.json")
    genre_dictionary = read_dict_from_file("dictionaries/genre_dictionary.json")
    language_dictionary = read_dict_from_file("dictionaries/language_dictionary.json")
    name_id_dictionary = read_dict_from_file("dictionaries/name_id_dictionary.json")
    sent_letters_with_id = create_sent_publication(COLLECTION_ID, persons_list, sent_letters, name_dictionary, genre_dictionary, language_dictionary, name_id_dictionary)
    write_list_to_csv(sent_letters_with_id, CSV_OUT)
    # close the cursor before its connection; the original closed the
    # connection first, leaving the cursor dangling
    cursor.close()
    conn_db.close()


# guard so importing this module doesn't immediately hit the database
if __name__ == "__main__":
    main()
'''
sample input:
1873?;lähetetty kirje;x;;Mechelin, Alexandra;rim;;ruotsi;113 LM/KA;1;1493079741;0395;0395;1886828.KA;;;;;['M:/Faksimiili/Mechelin_4/1493079741/master/0395.jpg'];
sample output:
1873?;lähetetty kirje;x;;Mechelin, Alexandra;rim;;ruotsi;113 LM/KA;1;1493079741;0395;0395;1886828.KA;;;;;['M:/Faksimiili/Mechelin_4/1493079741/master/0395.jpg'];1749;ca 1873 LM–Alexandra Mechelin;
'''